mirror of
https://github.com/ThomasDickey/mawk-snapshots.git
synced 2026-01-27 03:14:29 +00:00
110 lines
2.1 KiB
Awk
110 lines
2.1 KiB
Awk
# $MawkId: wfrq0.awk,v 1.3 2020/01/20 11:52:06 tom Exp $
|
|
# Test-script for MAWK
|
|
###############################################################################
|
|
# copyright 1993, Michael D. Brennan
|
|
#
|
|
# This is a source file for mawk, an implementation of
|
|
# the AWK programming language.
|
|
#
|
|
# Mawk is distributed without warranty under the terms of
|
|
# the GNU General Public License, version 2, 1991.
|
|
###############################################################################
|
|
|
|
# this program finds the twenty most frequent
|
|
# words in document using a heap sort at the end
|
|
#
|
|
#
|
|
|
|
function down_heap(i, k,hold)
|
|
{
|
|
while ( 1 )
|
|
{
|
|
if ( compare(heap[2*i], heap[2*i+1]) <= 0 ) k = 2*i
|
|
else k = 2*i + 1
|
|
|
|
if ( compare(heap[i],heap[k]) <= 0 ) return
|
|
|
|
hold = heap[k] ; heap[k] = heap[i] ; heap[i] = hold
|
|
i = k
|
|
}
|
|
}
|
|
|
|
# compares two values of form "number word"
|
|
# by number and breaks ties by word (reversed)
|
|
|
|
function compare(s1, s2, t, X)
|
|
{
|
|
t = (s1+0) - (s2+0) # forces types to number
|
|
|
|
if ( t == 0 )
|
|
{
|
|
split(s1, X); s1 = X[2]
|
|
split(s2, X); s2 = X[2]
|
|
if ( s2 < s1 ) return -1
|
|
return s1 < s2
|
|
}
|
|
|
|
return t
|
|
}
|
|
|
|
|
|
BEGIN { RS = "[^a-zA-Z]+" ; BIG = "999999:" }
|
|
|
|
{ cnt[$0]++ }
|
|
|
|
END { delete cnt[ "" ]
|
|
|
|
# load twenty values
|
|
j = 1
|
|
for( i in cnt )
|
|
{
|
|
heap[j] = num_word( cnt[i] , i )
|
|
delete cnt[i] ;
|
|
if ( ++j == 21 ) break ;
|
|
}
|
|
|
|
# make some sentinels
|
|
for( i = j ; i < 43 ; i++ ) heap[i] = BIG
|
|
|
|
h_empty = j # save the first empty slot
|
|
# make a heap with the smallest in slot 1
|
|
for( i = h_empty - 1 ; i > 0 ; i-- ) down_heap(i)
|
|
|
|
# examine the rest of the values
|
|
for ( i in cnt )
|
|
{
|
|
j = num_word(cnt[i], i)
|
|
if ( compare(j, heap[1]) > 0 )
|
|
{ # its bigger
|
|
# take the smallest out of the heap and readjust
|
|
heap[1] = j
|
|
down_heap(1)
|
|
}
|
|
}
|
|
|
|
h_empty-- ;
|
|
|
|
# what's left are the twenty largest
|
|
# smallest at the top
|
|
#
|
|
|
|
i = 20
|
|
while ( h_empty > 1 )
|
|
{
|
|
buffer[i--] = heap[1]
|
|
heap[1] = heap[h_empty]
|
|
heap[h_empty] = BIG
|
|
down_heap(1)
|
|
h_empty--
|
|
}
|
|
buffer[i--] = heap[1]
|
|
|
|
for(j = 1 ; j <= 20 ; j++ ) print buffer[j]
|
|
}
|
|
|
|
|
|
function num_word(num, word)
|
|
{
|
|
return sprintf("%3d %s", num, word)
|
|
}
|