mirror of
https://https.git.savannah.gnu.org/git/gzip.git
synced 2026-01-27 01:44:23 +00:00
774 lines
24 KiB
C
774 lines
24 KiB
C
/* match.s -- optional optimized asm version of longest match in deflate.c
|
|
|
|
Copyright (C) 2002, 2006, 2009-2026 Free Software Foundation, Inc.
|
|
Copyright (C) 1992-1993 Jean-loup Gailly
|
|
|
|
This program is free software; you can redistribute it and/or modify
|
|
it under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation; either version 3, or (at your option)
|
|
any later version.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with this program. If not, see <https://www.gnu.org/licenses/>. */
|
|
|
|
/*
|
|
* The 68020 version has been written by Francesco Potortì <pot@cnuce.cnr.it>
|
|
* with adaptations by Carsten Steger <stegerc@informatik.tu-muenchen.de>,
|
|
* Andreas Schwab <schwab@lamothe.informatik.uni-dortmund.de> and
|
|
* Kristoffer Eriksson <ske@pkmab.se>
|
|
*
|
|
* The ia64 version has been written by Sverre Jarp (HP Labs) 2001-2002.
|
|
* Unwind directives and some reformatting for better readability added by
|
|
* David Mosberger-Tang <davidm@hpl.hp.com>.
|
|
*/
|
|
|
|
/* Preprocess with -DNO_UNDERLINE if your C compiler does not prefix
|
|
* external symbols with an underline character '_'.
|
|
*/
|
|
#ifdef NO_UNDERLINE
|
|
# define _prev prev
|
|
# define _window window
|
|
# define _match_start match_start
|
|
# define _prev_length prev_length
|
|
# define _good_match good_match
|
|
# define _nice_match nice_match
|
|
# define _strstart strstart
|
|
# define _max_chain_length max_chain_length
|
|
|
|
# define _match_init match_init
|
|
# define _longest_match longest_match
|
|
#endif
|
|
|
|
#ifdef DYN_ALLOC
|
|
error: DYN_ALLOC not yet supported in match.s
|
|
#endif
|
|
|
|
/* On x86-64, Sun C 5.13 (Oracle Solaris Studio 12.4) 'cc -E -m64'
|
|
defines i386 when compiling .s or .S files! Luckily it also
|
|
defines __x86_64__. See Bug#23133. */
|
|
#if ((defined i386 || defined _I386 || defined __i386 || defined __i386__) \
|
|
&& !defined __x86_64__)
|
|
|
|
/* This version is for 386 Unix or OS/2 in 32 bit mode.
|
|
* Warning: it uses the AT&T syntax: mov source,dest
|
|
* This file is only optional. If you want to force the C version,
|
|
* add -DNO_ASM to CFLAGS in Makefile and set OBJA to an empty string.
|
|
* If you have reduced WSIZE in gzip.h, then change its value below.
|
|
* This version assumes static allocation of the arrays (-DDYN_ALLOC not used).
|
|
*/
|
|
|
|
.file "match.S"
|
|
|
|
#define MAX_MATCH 258
|
|
#define MAX_MATCH2 $128 /* MAX_MATCH/2-1 */
|
|
#define MIN_MATCH 3
|
|
#define WSIZE $32768
|
|
#define MAX_DIST WSIZE - MAX_MATCH - MIN_MATCH - 1
|
|
|
|
.globl _match_init
|
|
.globl _longest_match
|
|
|
|
.text
|
|
|
|
_match_init:
|
|
ret
|
|
|
|
/*-----------------------------------------------------------------------
|
|
* Set match_start to the longest match starting at the given string and
|
|
* return its length. Matches shorter or equal to prev_length are discarded,
|
|
* in which case the result is equal to prev_length and match_start is
|
|
* garbage.
|
|
* IN assertions: cur_match is the head of the hash chain for the current
|
|
* string (strstart) and its distance is <= MAX_DIST, and prev_length >= 1
|
|
*/
|
|
|
|
_longest_match: /* int longest_match(cur_match) */
|
|
|
|
#define cur_match 20(%esp)
|
|
/* return address */ /* esp+16 */
|
|
push %ebp /* esp+12 */
|
|
push %edi /* esp+8 */
|
|
push %esi /* esp+4 */
|
|
push %ebx /* esp */
|
|
|
|
/*
|
|
* match equ esi
|
|
* scan equ edi
|
|
* chain_length equ ebp
|
|
* best_len equ ebx
|
|
* limit equ edx
|
|
*/
|
|
mov cur_match,%esi
|
|
mov _max_chain_length,%ebp /* chain_length = max_chain_length */
|
|
mov _strstart,%edi
|
|
mov %edi,%edx
|
|
sub MAX_DIST,%edx /* limit = strstart-MAX_DIST */
|
|
jae limit_ok
|
|
sub %edx,%edx /* limit = NIL */
|
|
limit_ok:
|
|
add $2+_window,%edi /* edi = offset(window+strstart+2) */
|
|
mov _prev_length,%ebx /* best_len = prev_length */
|
|
movw -3(%ebx,%edi),%ax /* ax = scan[best_len-1..best_len] */
|
|
movw -2(%edi),%cx /* cx = scan[0..1] */
|
|
cmp _good_match,%ebx /* do we have a good match already? */
|
|
jb do_scan
|
|
shr $2,%ebp /* chain_length >>= 2 */
|
|
jmp do_scan
|
|
|
|
.align 4
|
|
long_loop:
|
|
/* at this point, edi == scan+2, esi == cur_match */
|
|
movw -3(%ebx,%edi),%ax /* ax = scan[best_len-1..best_len] */
|
|
movw -2(%edi),%cx /* cx = scan[0..1] */
|
|
short_loop:
|
|
/*
|
|
* at this point, di == scan+2, si == cur_match,
|
|
* ax = scan[best_len-1..best_len] and cx = scan[0..1]
|
|
*/
|
|
and WSIZE-1, %esi
|
|
movw _prev(%esi,%esi),%si /* cur_match = prev[cur_match] */
|
|
/* top word of esi is still 0 */
|
|
cmp %edx,%esi /* cur_match <= limit ? */
|
|
jbe the_end
|
|
dec %ebp /* --chain_length */
|
|
jz the_end
|
|
do_scan:
|
|
cmpw _window-1(%ebx,%esi),%ax/* check match at best_len-1 */
|
|
jne short_loop
|
|
cmpw _window(%esi),%cx /* check min_match_length match */
|
|
jne short_loop
|
|
|
|
lea _window+2(%esi),%esi /* si = match */
|
|
mov %edi,%eax /* ax = scan+2 */
|
|
mov MAX_MATCH2,%ecx /* scan for at most MAX_MATCH bytes */
|
|
rep; cmpsw /* loop until mismatch */
|
|
je maxmatch /* match of length MAX_MATCH? */
|
|
mismatch:
|
|
movb -2(%edi),%cl /* mismatch on first or second byte? */
|
|
subb -2(%esi),%cl /* cl = 0 if first bytes equal */
|
|
xchg %edi,%eax /* edi = scan+2, eax = end of scan */
|
|
sub %edi,%eax /* eax = len */
|
|
sub %eax,%esi /* esi = cur_match + 2 + offset(window) */
|
|
sub $2+_window,%esi /* esi = cur_match */
|
|
subb $1,%cl /* set carry if cl == 0 (cannot use DEC) */
|
|
adc $0,%eax /* eax = carry ? len+1 : len */
|
|
cmp %ebx,%eax /* len > best_len ? */
|
|
jle long_loop
|
|
mov %esi,_match_start /* match_start = cur_match */
|
|
mov %eax,%ebx /* ebx = best_len = len */
|
|
cmp _nice_match,%eax /* len >= nice_match ? */
|
|
jl long_loop
|
|
the_end:
|
|
mov %ebx,%eax /* result = eax = best_len */
|
|
pop %ebx
|
|
pop %esi
|
|
pop %edi
|
|
pop %ebp
|
|
ret
|
|
maxmatch:
|
|
cmpsb
|
|
jmp mismatch
|
|
|
|
#else
|
|
|
|
/* ======================== 680x0 version ================================= */
|
|
|
|
#if defined(m68k)||defined(mc68k)||defined(__mc68000__)||defined(__MC68000__)
|
|
# ifndef mc68000
|
|
# define mc68000
|
|
# endif
|
|
#endif
|
|
|
|
#if defined(__mc68020__) || defined(__MC68020__) || defined(sysV68)
|
|
# ifndef mc68020
|
|
# define mc68020
|
|
# endif
|
|
#endif
|
|
|
|
#if defined(mc68020) || defined(mc68000)
|
|
|
|
#if (defined(mc68020) || defined(NeXT)) && !defined(UNALIGNED_OK)
|
|
# define UNALIGNED_OK
|
|
#endif
|
|
|
|
#ifdef sysV68 /* Try Motorola Delta style */
|
|
|
|
# define GLOBAL(symbol) global symbol
|
|
# define TEXT text
|
|
# define FILE(filename) file filename
|
|
# define invert_maybe(src,dst) dst,src
|
|
# define imm(data) &data
|
|
# define reg(register) %register
|
|
|
|
# define addl add.l
|
|
# define addql addq.l
|
|
# define blos blo.b
|
|
# define bhis bhi.b
|
|
# define bras bra.b
|
|
# define clrl clr.l
|
|
# define cmpmb cmpm.b
|
|
# define cmpw cmp.w
|
|
# define cmpl cmp.l
|
|
# define lslw lsl.w
|
|
# define lsrl lsr.l
|
|
# define movel move.l
|
|
# define movew move.w
|
|
# define moveb move.b
|
|
# define moveml movem.l
|
|
# define subl sub.l
|
|
# define subw sub.w
|
|
# define subql subq.l
|
|
|
|
# define IndBase(bd,An) (bd,An)
|
|
# define IndBaseNdxl(bd,An,Xn) (bd,An,Xn.l)
|
|
# define IndBaseNdxw(bd,An,Xn) (bd,An,Xn.w)
|
|
# define predec(An) -(An)
|
|
# define postinc(An) (An)+
|
|
|
|
#else /* default style (Sun 3, NeXT, Atari) */
|
|
|
|
# define GLOBAL(symbol) .globl symbol
|
|
# define TEXT .text
|
|
# define FILE(filename) .even
|
|
# define invert_maybe(src,dst) src,dst
|
|
# if defined(sun) || defined(mc68k)
|
|
# define imm(data) #data
|
|
# else
|
|
# define imm(data) \#data
|
|
# endif
|
|
# define reg(register) register
|
|
|
|
# define blos bcss
|
|
# if defined(sun) || defined(mc68k)
|
|
# define movel movl
|
|
# define movew movw
|
|
# define moveb movb
|
|
# endif
|
|
# define IndBase(bd,An) An@(bd)
|
|
# define IndBaseNdxl(bd,An,Xn) An@(bd,Xn:l)
|
|
# define IndBaseNdxw(bd,An,Xn) An@(bd,Xn:w)
|
|
# define predec(An) An@-
|
|
# define postinc(An) An@+
|
|
|
|
#endif /* styles */
|
|
|
|
#define Best_Len reg(d0) /* unsigned */
|
|
#define Cur_Match reg(d1) /* Ipos */
|
|
#define Loop_Counter reg(d2) /* int */
|
|
#define Scan_Start reg(d3) /* unsigned short */
|
|
#define Scan_End reg(d4) /* unsigned short */
|
|
#define Limit reg(d5) /* IPos */
|
|
#define Chain_Length reg(d6) /* unsigned */
|
|
#define Scan_Test reg(d7)
|
|
#define Scan reg(a0) /* *uch */
|
|
#define Match reg(a1) /* *uch */
|
|
#define Prev_Address reg(a2) /* *Pos */
|
|
#define Scan_Ini reg(a3) /* *uch */
|
|
#define Match_Ini reg(a4) /* *uch */
|
|
#define Stack_Pointer reg(sp)
|
|
|
|
#define MAX_MATCH 258
|
|
#define MIN_MATCH 3
|
|
#define WSIZE 32768
|
|
#define MAX_DIST (WSIZE - MAX_MATCH - MIN_MATCH - 1)
|
|
|
|
GLOBAL (_match_init)
|
|
GLOBAL (_longest_match)
|
|
|
|
TEXT
|
|
|
|
FILE ("match.S")
|
|
|
|
_match_init:
|
|
rts
|
|
|
|
/*-----------------------------------------------------------------------
|
|
* Set match_start to the longest match starting at the given string and
|
|
* return its length. Matches shorter or equal to prev_length are discarded,
|
|
* in which case the result is equal to prev_length and match_start is
|
|
* garbage.
|
|
* IN assertions: cur_match is the head of the hash chain for the current
|
|
* string (strstart) and its distance is <= MAX_DIST, and prev_length >= 1
|
|
*/
|
|
|
|
/* int longest_match (cur_match) */
|
|
|
|
#ifdef UNALIGNED_OK
|
|
# define pushreg 15928 /* d2-d6/a2-a4 */
|
|
# define popreg 7292
|
|
#else
|
|
# define pushreg 16184 /* d2-d7/a2-a4 */
|
|
# define popreg 7420
|
|
#endif
|
|
|
|
_longest_match:
|
|
movel IndBase(4,Stack_Pointer),Cur_Match
|
|
moveml imm(pushreg),predec(Stack_Pointer)
|
|
movel _max_chain_length,Chain_Length
|
|
movel _prev_length,Best_Len
|
|
movel imm(_prev),Prev_Address
|
|
movel imm(_window+MIN_MATCH),Match_Ini
|
|
movel _strstart,Limit
|
|
movel Match_Ini,Scan_Ini
|
|
addl Limit,Scan_Ini
|
|
subw imm(MAX_DIST),Limit
|
|
bhis L__limit_ok
|
|
clrl Limit
|
|
L__limit_ok:
|
|
cmpl invert_maybe(_good_match,Best_Len)
|
|
blos L__length_ok
|
|
lsrl imm(2),Chain_Length
|
|
L__length_ok:
|
|
subql imm(1),Chain_Length
|
|
#ifdef UNALIGNED_OK
|
|
movew IndBase(-MIN_MATCH,Scan_Ini),Scan_Start
|
|
movew IndBaseNdxw(-MIN_MATCH-1,Scan_Ini,Best_Len),Scan_End
|
|
#else
|
|
moveb IndBase(-MIN_MATCH,Scan_Ini),Scan_Start
|
|
lslw imm(8),Scan_Start
|
|
moveb IndBase(-MIN_MATCH+1,Scan_Ini),Scan_Start
|
|
moveb IndBaseNdxw(-MIN_MATCH-1,Scan_Ini,Best_Len),Scan_End
|
|
lslw imm(8),Scan_End
|
|
moveb IndBaseNdxw(-MIN_MATCH,Scan_Ini,Best_Len),Scan_End
|
|
#endif
|
|
bras L__do_scan
|
|
|
|
L__long_loop:
|
|
#ifdef UNALIGNED_OK
|
|
movew IndBaseNdxw(-MIN_MATCH-1,Scan_Ini,Best_Len),Scan_End
|
|
#else
|
|
moveb IndBaseNdxw(-MIN_MATCH-1,Scan_Ini,Best_Len),Scan_End
|
|
lslw imm(8),Scan_End
|
|
moveb IndBaseNdxw(-MIN_MATCH,Scan_Ini,Best_Len),Scan_End
|
|
#endif
|
|
|
|
L__short_loop:
|
|
lslw imm(1),Cur_Match
|
|
movew IndBaseNdxl(0,Prev_Address,Cur_Match),Cur_Match
|
|
cmpw invert_maybe(Limit,Cur_Match)
|
|
dbls Chain_Length,L__do_scan
|
|
bras L__return
|
|
|
|
L__do_scan:
|
|
movel Match_Ini,Match
|
|
addl Cur_Match,Match
|
|
#ifdef UNALIGNED_OK
|
|
cmpw invert_maybe(IndBaseNdxw(-MIN_MATCH-1,Match,Best_Len),Scan_End)
|
|
bne L__short_loop
|
|
cmpw invert_maybe(IndBase(-MIN_MATCH,Match),Scan_Start)
|
|
bne L__short_loop
|
|
#else
|
|
moveb IndBaseNdxw(-MIN_MATCH-1,Match,Best_Len),Scan_Test
|
|
lslw imm(8),Scan_Test
|
|
moveb IndBaseNdxw(-MIN_MATCH,Match,Best_Len),Scan_Test
|
|
cmpw invert_maybe(Scan_Test,Scan_End)
|
|
bne L__short_loop
|
|
moveb IndBase(-MIN_MATCH,Match),Scan_Test
|
|
lslw imm(8),Scan_Test
|
|
moveb IndBase(-MIN_MATCH+1,Match),Scan_Test
|
|
cmpw invert_maybe(Scan_Test,Scan_Start)
|
|
bne L__short_loop
|
|
#endif
|
|
|
|
movew imm((MAX_MATCH-MIN_MATCH+1)-1),Loop_Counter
|
|
movel Scan_Ini,Scan
|
|
L__scan_loop:
|
|
cmpmb postinc(Match),postinc(Scan)
|
|
dbne Loop_Counter,L__scan_loop
|
|
|
|
subl Scan_Ini,Scan
|
|
addql imm(MIN_MATCH-1),Scan
|
|
cmpl invert_maybe(Best_Len,Scan)
|
|
bls L__short_loop
|
|
movel Scan,Best_Len
|
|
movel Cur_Match,_match_start
|
|
cmpl invert_maybe(_nice_match,Best_Len)
|
|
blos L__long_loop
|
|
L__return:
|
|
moveml postinc(Stack_Pointer),imm(popreg)
|
|
rts
|
|
|
|
#else
|
|
|
|
# if defined (__ia64__)
|
|
|
|
/* ======================== ia64 version ================================= */
|
|
|
|
/*
|
|
* 'longest_match.S' (assembly program for gzip for the IA-64 architecture)
|
|
*
|
|
* Optimized for McKinley, but with Merced-compatibility, such as
|
|
* MIB+MIB, used wherever possible.
|
|
*
|
|
* Copyright: Sverre Jarp (HP Labs) 2001-2002
|
|
*
|
|
* See deflate.c for c-version
|
|
* Version 2 - Optimize the outer loop
|
|
*/
|
|
|
|
#include <endian.h>
|
|
|
|
#if __BYTE_ORDER == ____BIG_ENDIAN
|
|
#define first shl
|
|
#define second shr.u
|
|
#define count czx1.l
|
|
#else
|
|
#define first shr.u
|
|
#define second shl
|
|
#define count czx1.r
|
|
#endif
|
|
|
|
// 24 rotating register (r32 - r55)
|
|
|
|
#define s_vmatch0 r32
|
|
#define s_vmatch1 r33
|
|
#define s_vmatbst r34
|
|
#define s_vmatbst1 r35
|
|
#define s_amatblen r36
|
|
|
|
#define s_tm1 r56
|
|
#define s_tm2 r57
|
|
#define s_tm3 r58
|
|
#define s_tm4 r59
|
|
#define s_tm5 r60
|
|
#define s_tm6 r61
|
|
#define s_tm7 r62
|
|
#define s_tm8 r63
|
|
|
|
#define s_vlen r31
|
|
#define s_vstrstart r30
|
|
#define s_vchainlen r29
|
|
#define s_awinbest r28
|
|
#define s_vcurmatch r27
|
|
#define s_vlimit r26
|
|
#define s_vscanend r25
|
|
#define s_vscanend1 r24
|
|
#define s_anicematch r23
|
|
#define s_vscan0 r22
|
|
#define s_vscan1 r21
|
|
|
|
#define s_aprev r20
|
|
#define s_awindow r19
|
|
#define s_amatchstart r18
|
|
#define s_ascan r17
|
|
#define s_amatch r16
|
|
#define s_wmask r15
|
|
#define s_ascanend r14
|
|
|
|
#define s_vspec_cmatch r11 // next iteration
|
|
#define s_lcsave r10
|
|
#define s_prsave r9
|
|
#define s_vbestlen r8 // return register
|
|
|
|
#define s_vscan3 r3
|
|
#define s_vmatch3 r2
|
|
|
|
#define p_no p2
|
|
#define p_yes p3
|
|
#define p_shf p4 //
|
|
#define p_bn2 p5 // Use in loop (indicating bestlen != 2)
|
|
|
|
#define p_nbs p9 // not new best_len
|
|
#define p_nnc p10 // not nice_length
|
|
#define p_ll p11
|
|
#define p_end p12
|
|
|
|
#define MAX_MATCH 258
|
|
#define MIN_MATCH 4
|
|
#define WSIZE 32768
|
|
#define MAX_DIST WSIZE - MAX_MATCH - MIN_MATCH - 1
|
|
|
|
#define R_INPUT 1
|
|
#define R_LOCAL 31
|
|
#define R_OUTPUT 0
|
|
#define R_ROTATING 24
|
|
#define MLAT 3
|
|
#define SHLAT 2
|
|
|
|
#define mova mov
|
|
#define movi0 mov
|
|
#define cgtu cmp.gt.unc
|
|
#define cgeu cmp.ge.unc
|
|
#define cneu cmp.ne.unc
|
|
|
|
.global longest_match
|
|
.proc longest_match
|
|
.align 32
|
|
longest_match:
|
|
// -- Cycle: 0
|
|
.prologue
|
|
{.mmi
|
|
alloc r2=ar.pfs,R_INPUT,R_LOCAL,R_OUTPUT,R_ROTATING
|
|
.rotr scan[MLAT+2], match[MLAT+2], shscan0[SHLAT+1], \
|
|
shscan1[SHLAT+1], shmatch0[SHLAT+1], shmatch1[SHLAT+1]
|
|
.rotp lc[MLAT+SHLAT+2]
|
|
mova s_vspec_cmatch=in0 // cur_match from input register
|
|
add s_tm1=@gprel(strstart),gp // a(a(strstart))
|
|
}{.mmi
|
|
add s_tm3=@gprel(prev_length),gp // a(a(prev_length))
|
|
add s_tm5=@ltoff(window),gp // a(a(window))
|
|
add s_tm6=@ltoff(prev),gp // a(a(prev))
|
|
;;
|
|
}{.mmb // Cycle: 1
|
|
ld4 s_vstrstart=[s_tm1] // strstart
|
|
ld4 s_vbestlen=[s_tm3] // best_len = prev_length
|
|
brp.loop.imp .cmploop,.cmploop+48
|
|
}{.mli
|
|
add s_tm2=@gprel(max_chain_length),gp // a(a(max_chain_length))
|
|
movl s_wmask=WSIZE-1
|
|
;;
|
|
}{.mmi // Cycle: 2
|
|
ld8 s_aprev=[s_tm6] // a(prev)
|
|
ld8 s_awindow=[s_tm5] // a(window)
|
|
.save pr, s_prsave
|
|
movi0 s_prsave=pr // save predicates
|
|
}{.mmi
|
|
add s_tm4=@gprel(good_match),gp // a(a(good_match))
|
|
add s_tm7=@ltoff(nice_match),gp // a(a(nice_match))
|
|
add s_tm8=@ltoff(match_start),gp // a(match_start)
|
|
;;
|
|
}{.mmi // Cycle: 3
|
|
ld8 s_anicematch=[s_tm7] // a(nice_match)
|
|
ld8 s_amatchstart=[s_tm8] // a(match_start)
|
|
.save ar.lc, s_lcsave
|
|
movi0 s_lcsave=ar.lc // save loop count register
|
|
}{.mmi
|
|
.body
|
|
add s_tm1=-(MAX_MATCH + MIN_MATCH),s_wmask // maxdist
|
|
cmp.eq p_ll,p0=r0,r0 // parallel compare initialized as 'true'
|
|
mova s_vcurmatch=s_vspec_cmatch
|
|
;;
|
|
}{.mmi // Cycle: 4
|
|
ld4 s_vchainlen=[s_tm2] // chain_length=max_chain_length
|
|
ld4 s_tm4=[s_tm4] // v(good_match)
|
|
add s_ascan=s_awindow,s_vstrstart // scan=window + strstart
|
|
}{.mmi
|
|
sub s_vlimit=s_vstrstart, s_tm1 // limit=strstart - MAX_DIST
|
|
add s_amatch=s_awindow,s_vspec_cmatch // match=window + cur_match
|
|
and s_vspec_cmatch =s_vspec_cmatch,s_wmask
|
|
;;
|
|
}{.mmi // Cycle: 5
|
|
add s_amatblen=s_amatch,s_vbestlen //
|
|
cneu p_bn2,p0=2,s_vbestlen // set if bestlen != 2
|
|
add s_ascanend=s_ascan,s_vbestlen // compute a(scan) + best_len
|
|
}{.mmi
|
|
ld1 s_vscan0=[s_ascan],1 // NB: s_ascan++
|
|
ld1 s_vmatch0=[s_amatch],1
|
|
cgtu p0,p_no=s_vlimit,r0 // is result positive ?
|
|
;;
|
|
}{.mmi // Cycle: 6
|
|
ld1.nt1 s_vscan1=[s_ascan],2 // NB: s_ascan+3 in total
|
|
ld1.nt1 s_vmatch1=[s_amatch],2
|
|
add s_awinbest=s_awindow,s_vbestlen //
|
|
;;
|
|
}{.mmi // Cycle: 7
|
|
ld1.nt1 s_vscanend=[s_ascanend],-1 // scan_end=scan[best_len]
|
|
ld1.nt1 s_vmatbst=[s_amatblen],-1
|
|
(p_no) mova s_vlimit=r0
|
|
;;
|
|
}{.mmi // Cycle: 8
|
|
(p_bn2) ld1.nt1 s_vscanend1=[s_ascanend],1 // scan_end1=scan[best_len-1]
|
|
(p_bn2) ld1.nt1 s_vmatbst1=[s_amatblen]
|
|
shladd s_vspec_cmatch =s_vspec_cmatch,1,s_aprev
|
|
}{.mmi
|
|
cgeu p_shf,p0=s_vbestlen,s_tm4 // is (prev_length >= good_match) ?
|
|
;;
|
|
}{.mmi // Cycle: 9
|
|
ld1.nt1 s_vscan3=[s_ascan]
|
|
ld2.nt1 s_vspec_cmatch=[s_vspec_cmatch]
|
|
mova s_vlen=3
|
|
}{.mmi
|
|
(p_shf) shr.u s_vchainlen=s_vchainlen,2 // (cur_len) >> 2
|
|
;;
|
|
}{.mmi // Cycle: 10
|
|
ld1.nt1 s_vmatch3=[s_amatch]
|
|
// p_ll switched on as soon as we get a mismatch:
|
|
cmp.eq.and p_ll,p0=s_vmatch0,s_vscan0
|
|
cmp.eq.and p_ll,p0=s_vmatbst,s_vscanend
|
|
}{.mib
|
|
cmp.eq.and p_ll,p0=s_vmatch1,s_vscan1
|
|
(p_bn2) cmp.eq.and p_ll,p0=s_vmatbst1,s_vscanend1
|
|
(p_ll) br.cond.dpnt.many .test_more
|
|
;;
|
|
}
|
|
|
|
.next_iter:
|
|
{.mmi // Cycle 0
|
|
add s_amatch=s_awindow,s_vspec_cmatch // match=window + cur_match
|
|
mov s_vcurmatch=s_vspec_cmatch // current value
|
|
add s_vchainlen=-1,s_vchainlen // --chain_length
|
|
}{.mib
|
|
cmp.le.unc p_end,p0=s_vspec_cmatch,s_vlimit
|
|
and s_vspec_cmatch=s_vspec_cmatch,s_wmask
|
|
(p_end) br.cond.dptk.many .terminate
|
|
;;
|
|
}{.mmi // Cycle 1
|
|
ld1 s_vmatch0=[s_amatch],1 // load match[0]
|
|
// compute prev[cur_match]:
|
|
shladd s_vspec_cmatch=s_vspec_cmatch,1,s_aprev
|
|
cmp.eq.unc p_end,p0=s_vchainlen,r0
|
|
} {.mib
|
|
nop.m 0
|
|
add s_amatblen=s_awinbest,s_vcurmatch // match=window + cur_match
|
|
(p_end) br.cond.dptk.many .terminate
|
|
;;
|
|
}{.mmi // Cycle 2 (short)
|
|
ld2.nt1 s_vspec_cmatch=[s_vspec_cmatch] // get next cur_match
|
|
;;
|
|
}{.mmi // Cycle 3 (short)
|
|
ld1.nt1 s_vmatbst=[s_amatblen],-1 // load match[best_len]
|
|
cmp.ne.unc p_ll,p0=r0,r0 // parallel compare initialized as 'false'
|
|
;;
|
|
}{.mmi // Cycle 4 (short)
|
|
// load match[1] - - note: match += 3 (in total):
|
|
ld1.nt1 s_vmatch1=[s_amatch],2
|
|
;;
|
|
// Cycle 5 (short)
|
|
(p_bn2) ld1.nt1 s_vmatbst1=[s_amatblen] // load match[best_len-1]
|
|
}{.mib // Here we (MOST LIKELY) pay a L2-fetch stall
|
|
// p_ll switched on as soon as we get a mismatch:
|
|
cmp.ne.or p_ll,p0=s_vmatch0,s_vscan0
|
|
cmp.ne.or p_ll,p0=s_vmatbst,s_vscanend
|
|
(p_ll) br.cond.dptk.many .next_iter
|
|
;;
|
|
}{.mmi // Cycle 6
|
|
ld1.nt1 s_vmatch3=[s_amatch]
|
|
mova s_vlen=3
|
|
nop.i 0
|
|
}{.mib
|
|
cmp.ne.or p_ll,p0=s_vmatch1,s_vscan1
|
|
(p_bn2) cmp.ne.or p_ll,p0=s_vmatbst1,s_vscanend1
|
|
(p_ll) br.cond.dptk.many .next_iter
|
|
;;
|
|
}
|
|
|
|
// We have passed the first hurdle - Are there additional matches ???
|
|
|
|
.test_more:
|
|
{.mmi // Cycle 0
|
|
and s_tm3=7,s_ascan // get byte offset
|
|
and s_tm4=7,s_amatch // get byte offset
|
|
movi0 ar.ec=MLAT+SHLAT+2 // NB: One trip more than usual
|
|
}{.mib
|
|
cmp.ne.unc p_no,p0=s_vscan3,s_vmatch3 // does not next one differ?
|
|
(p_no) br.cond.dptk.many .only3
|
|
;;
|
|
}{.mmi // Cycle 1
|
|
and s_tm1=-8,s_ascan // get aligned address
|
|
shladd s_tm3=s_tm3,3,r0
|
|
movi0 ar.lc=31 // 32 times around the loop (8B at a time)
|
|
}{.mib
|
|
and s_tm2=-8,s_amatch // get aligned address
|
|
shladd s_tm4=s_tm4,3,r0
|
|
nop.b 0
|
|
;;
|
|
}{.mmi // Cycle 2
|
|
ld8.nt1 scan[1]=[s_tm1],8 // load first chunk
|
|
sub s_tm5=64,s_tm3 // 64 - amount
|
|
movi0 pr.rot=1<<16
|
|
}{.mmi
|
|
ld8.nt1 match[1]=[s_tm2],8 // load first chunk
|
|
sub s_tm6=64,s_tm4 // 64 - amount
|
|
add s_vlen=-8,s_vlen // will be updated at least once
|
|
;;
|
|
}
|
|
.align 32
|
|
.cmploop:
|
|
{.mmi // Cycle 0
|
|
(lc[0]) ld8 scan[0]=[s_tm1],8 // next scan chunk
|
|
(lc[MLAT+SHLAT+1]) add s_vlen=8,s_vlen
|
|
(lc[MLAT]) first shscan0[0]=scan[MLAT+1],s_tm3
|
|
}{.mib
|
|
(lc[MLAT+SHLAT+1]) cmp.ne.unc p_no,p0=s_tm7,s_tm8 // break search if !=
|
|
(lc[MLAT]) first shmatch0[0]=match[MLAT+1],s_tm4
|
|
(p_no) br.cond.dpnt.many .mismatch
|
|
;;
|
|
}{.mii // Cycle 1
|
|
(lc[0]) ld8 match[0]=[s_tm2],8
|
|
// shift left(le) or right(be):
|
|
(lc[MLAT]) second shscan1[0]=scan[MLAT],s_tm5
|
|
(lc[MLAT]) second shmatch1[0]=match[MLAT],s_tm6
|
|
}{.mmb
|
|
(lc[MLAT+SHLAT]) or s_tm7=shscan0[SHLAT],shscan1[SHLAT]
|
|
(lc[MLAT+SHLAT]) or s_tm8=shmatch0[SHLAT],shmatch1[SHLAT]
|
|
br.ctop.dptk.many .cmploop
|
|
;;
|
|
}{.mfi
|
|
mov s_vlen=258
|
|
nop.f 0
|
|
}{.mfi
|
|
nop.f 0 // realign
|
|
;;
|
|
}
|
|
.mismatch:
|
|
{.mii // Cycle 0 (short)
|
|
(p_no) pcmp1.eq s_tm2=s_tm7,s_tm8 // find first non-matching character
|
|
nop.i 0
|
|
;;
|
|
// Cycle 1 (short)
|
|
(p_no) count s_tm1=s_tm2
|
|
;;
|
|
}{.mib // Cycle 2 (short)
|
|
(p_no) add s_vlen=s_vlen,s_tm1 // effective length
|
|
nop.i 0
|
|
clrrrb
|
|
;;
|
|
}
|
|
|
|
.only3:
|
|
{.mib // Cycle 0 (short)
|
|
cmp.gt.unc p0,p_nbs=s_vlen,s_vbestlen // (len > best_len) ?
|
|
(p_nbs) br.cond.dpnt.many .next_iter // if not, reiterate
|
|
;;
|
|
}{.mmi // Cycle 1 (short)
|
|
ld4 s_tm7=[s_anicematch] // nice_match
|
|
st4 [s_amatchstart]= s_vcurmatch
|
|
add s_ascanend=s_ascan,s_vlen // reset with best_len
|
|
;;
|
|
}{.mmi // Cycle 2 (short)
|
|
mova s_vbestlen=s_vlen
|
|
add s_ascanend=-3,s_ascanend // remember extra offset
|
|
;;
|
|
}{.mmi // Cycle 3 (short)
|
|
ld1 s_vscanend=[s_ascanend],-1 // scan_end=scan[best_len]
|
|
add s_awinbest=s_awindow,s_vbestlen // update with new best_len
|
|
cmp.ne.unc p_bn2,p0=2,s_vbestlen // set if bestlen != 2
|
|
;;
|
|
}{.mib // Cycle 4 (short)
|
|
// scan_end1=scan[best_len-1] NB: s_ascanend reset:
|
|
ld1.nt1 s_vscanend1=[s_ascanend],1
|
|
cmp.lt.unc p_nnc,p0=s_vlen,s_tm7 // compare with nice_match
|
|
(p_nnc) br.cond.dptk.many .next_iter
|
|
;;
|
|
}
|
|
.terminate:
|
|
{.mii // Cycle 0/1
|
|
nop.m 0
|
|
movi0 ar.lc=s_lcsave
|
|
movi0 pr=s_prsave,-1
|
|
}{.mbb
|
|
nop.m 0
|
|
nop.b 0
|
|
br.ret.sptk.many rp // ret0 is identical to best_len
|
|
;;
|
|
}
|
|
.endp
|
|
|
|
.global match_init
|
|
.proc match_init
|
|
match_init:
|
|
sub ret0=ret0,ret0
|
|
br.ret.sptk.many rp
|
|
.endp
|
|
|
|
# else
|
|
error: this asm version is for 386 or 680x0 or ia64 only
|
|
# endif /* __ia64__ */
|
|
#endif /* mc68000 || mc68020 */
|
|
#endif /* i386 || _I386 */
|