mirror of
https://github.com/ThomasDickey/mawk-snapshots.git
synced 2026-01-27 03:14:29 +00:00
2032 lines
46 KiB
Groff
2032 lines
46 KiB
Groff
.\" $MawkId: mawk.1,v 1.67 2024/09/05 22:23:55 tom Exp $
|
|
.\" ###########################################################################
|
|
.\" # copyright 2008-2023,2024, Thomas E. Dickey
|
|
.\" # copyright 1996, Michael D. Brennan
|
|
.\" #
|
|
.\" # This is a source file for mawk, an implementation of
|
|
.\" # the AWK programming language.
|
|
.\" #
|
|
.\" # Mawk is distributed without warranty under the terms of
|
|
.\" # the GNU General Public License, version 2, 1991.
|
|
.\" ###########################################################################
|
|
.ds N Mawk
|
|
.ds n mawk
|
|
.TH MAWK 1 2024-09-05 "Version 1.3.4" "User commands"
|
|
.\" strings
|
|
.ds ex \fIexpr\fR
|
|
.\" Bulleted paragraph
|
|
.de bP
|
|
.ie n .IP \(bu 4
|
|
.el .IP \(bu 2
|
|
..
|
|
.\" Escape single quotes in literal strings from groff's Unicode transform.
|
|
.ie \n(.g \{\
|
|
.ds `` \(lq
|
|
.ds '' \(rq
|
|
.ds ' \(aq
|
|
.\}
|
|
.el \{\
|
|
.ie t .ds `` ``
|
|
.el .ds `` ""
|
|
.ie t .ds '' ''
|
|
.el .ds '' ""
|
|
.ie t .ds ' \(aq
|
|
.el .ds ' '
|
|
.\}
|
|
.\" **************************************************************************
|
|
.SH NAME
|
|
mawk \-
|
|
pattern scanning and text processing language
|
|
.\" **************************************************************************
|
|
.SH SYNOPSIS
|
|
\fB\*n\fP
|
|
[\-\fBW
|
|
.IR option ]
|
|
[\-\fBF
|
|
.IR value ]
|
|
[\-\fBv
|
|
.IR var=value ]
|
|
[\-\|\-] 'program text' [file ...]
|
|
.br
|
|
\fB\*n\fP
|
|
[\-\fBW
|
|
.IR option ]
|
|
[\-\fBF
|
|
.IR value ]
|
|
[\-\fBv
|
|
.IR var=value ]
|
|
[\-\fBf
|
|
.IR program-file ]
|
|
[\-\|\-] [file ...]
|
|
.\" **************************************************************************
|
|
.SH DESCRIPTION
|
|
\fB\*n\fP
|
|
is an interpreter for the AWK Programming Language.
|
|
The AWK language
|
|
is useful for manipulation of data files,
|
|
text retrieval and processing,
|
|
and for prototyping and experimenting with algorithms.
|
|
\fB\*n\fP
|
|
is a \fInew awk\fR meaning it implements the AWK language as
|
|
defined in Aho, Kernighan and Weinberger,
|
|
.I "The AWK Programming Language,"
|
|
Addison-Wesley Publishing, 1988 (hereafter referred to as
|
|
the AWK book).
|
|
\fB\*n\fP
|
|
conforms to the POSIX 1003.2
|
|
(draft 11.3)
|
|
definition of the AWK language
|
|
which contains a few features not described in the AWK book,
|
|
and \fB\*n\fP provides a small number of extensions.
|
|
.PP
|
|
An AWK program is a sequence of \fIpattern {action}\fR pairs and
|
|
function definitions.
|
|
Short programs are entered on the command line
|
|
usually enclosed in ' ' to avoid shell
|
|
interpretation.
|
|
Longer programs can be read in from a
|
|
file with the \-f option.
|
|
Data input is read from the list of files on
|
|
the command line or from standard input when the list is empty.
|
|
The input is broken into records as determined by the
|
|
record separator variable, \fBRS\fR.
|
|
Initially,
|
|
.B RS
|
|
= \*(``\en\*('' and records are synonymous with lines.
|
|
Each record is compared against each
|
|
.I pattern
|
|
and if it matches, the program text for
|
|
.I "{action}"
|
|
is executed.
|
|
.\" **************************************************************************
|
|
.SH OPTIONS
|
|
.TP \w'\-\fBW'u+\w'\fRsprintf=\fInum\fR'u+2n
|
|
\-\fBF \fIvalue\fP
|
|
sets the field separator, \fBFS\fR, to
|
|
.IR value .
|
|
.TP
|
|
\-\fBf \fIfile\fR
|
|
Program text is read from \fIfile\fR instead of from the
|
|
command line.
|
|
Multiple
|
|
.B \-f
|
|
options are allowed.
|
|
.TP
|
|
\-\fBv \fIvar=value\fR
|
|
assigns
|
|
.I value
|
|
to program variable
|
|
.IR var .
|
|
.TP
|
|
\-\|\-
|
|
indicates the unambiguous end of options.
|
|
.PP
|
|
The above options will be available with any POSIX compatible
|
|
implementation of AWK.
|
|
Implementation specific options are prefaced with
|
|
.BR \-W .
|
|
\fB\*n\fP
|
|
provides these:
|
|
.TP
|
|
\-\fBW \fRdump
|
|
writes an assembler-like listing of the internal
|
|
representation of the program to stdout and exits 0
|
|
(on successful compilation).
|
|
.TP
|
|
\-\fBW \fRexec \fIfile\fR
|
|
Program text is read from
|
|
.I file
|
|
and this is the last option.
|
|
.IP
|
|
This is a useful alternative to \-\fBf\fP on systems that support the
|
|
.B #!
|
|
\*(``magic number\*('' convention for executable scripts.
|
|
Those implicitly pass the pathname of the script itself as the final
|
|
parameter, and expect no more than one \*(``\-\*('' option on the \fB#!\fP line.
|
|
Because \fB\*n\fP can combine multiple \-\fBW\fP options separated by
|
|
commas, you can use this option when an additional \-\fBW\fP option is needed.
|
|
.TP
|
|
\-\fBW \fRhelp
|
|
prints a usage message to stderr and exits (same as \*(``\-\fBW\ \fRusage\*('').
|
|
.TP
|
|
\-\fBW \fRinteractive
|
|
sets unbuffered writes to stdout and line buffered reads from stdin.
|
|
Records from stdin are lines regardless of the value of
|
|
.BR RS .
|
|
.TP
|
|
\-\fBW \fRposix
|
|
modifies \fB\*n\fP's behavior to be more POSIX-compliant:
|
|
.RS
|
|
.bP
|
|
forces
|
|
\fB\*n\fP
|
|
not to consider '\en' to be space.
|
|
.RE
|
|
.IP
|
|
The original \*(``posix_space\*('' is recognized, but deprecated.
|
|
.TP
|
|
\-\fBW \fRrandom=\fInum\fR
|
|
calls \fBsrand\fP with the given parameter
|
|
(and overrides the auto-seeding behavior).
|
|
.TP
|
|
\-\fBW \fRsprintf=\fInum\fR
|
|
adjusts the size of
|
|
\fB\*n\fP's
|
|
internal sprintf buffer to
|
|
.I num
|
|
bytes.
|
|
More than rare use of this option indicates
|
|
\fB\*n\fP
|
|
should be recompiled.
|
|
.TP
|
|
\-\fBW \fRtraditional
|
|
Omit features such as interval expressions which were not supported by
|
|
traditional \fIawk\fP.
|
|
.TP
|
|
\-\fBW \fRusage
|
|
prints a usage message to stderr and exits (same as \*(``\-\fBW\ \fRhelp\*('').
|
|
.TP
|
|
\-\fBW \fRversion
|
|
\fB\*n\fP
|
|
writes its version and copyright
|
|
to stdout and compiled limits to
|
|
stderr and exits 0.
|
|
.PP
|
|
\fB\*n\fP accepts abbreviations for any of these options, e.g.,
|
|
\*(``\-\fBW\ \fRv\*('' and \*(``\-\fBW\fRv\*(''
|
|
both tell \fB\*n\fP to show its version.
|
|
.PP
|
|
\fB\*n\fP
|
|
allows multiple \fB\-W\fP options to be combined by separating the options
|
|
with commas, e.g., \-Wsprintf=2000,posix.
|
|
This is useful for executable
|
|
.B #!
|
|
\*(``magic number\*('' invocations in which only one argument is supported,
|
|
e.g., \-\fBWinteractive,exec\fP.
|
|
.\" **************************************************************************
|
|
.SH "THE AWK LANGUAGE"
|
|
.SS "\fB1. Program structure"
|
|
An AWK program is a sequence of
|
|
.I "pattern {action}"
|
|
pairs and user
|
|
function definitions.
|
|
.PP
|
|
A pattern can be:
|
|
.nf
|
|
.RS 5
|
|
\fBBEGIN\fR
|
|
\fBEND\fR
|
|
expression
|
|
expression , expression
|
|
.sp
|
|
.RE
|
|
.fi
|
|
One, but not both,
|
|
of \fIpattern {action}\fR can be omitted.
|
|
If
|
|
.I {action}
|
|
is omitted it is implicitly { print }.
|
|
If
|
|
.I pattern
|
|
is omitted, then it is implicitly matched.
|
|
.B BEGIN
|
|
and
|
|
.B END
|
|
patterns require an action.
|
|
.PP
|
|
Statements are terminated by newlines, semi-colons or both.
|
|
Groups of statements such as
|
|
actions or loop bodies are blocked via {\ ...\ } as in C.
|
|
The last statement in a block doesn't need a terminator.
|
|
Blank lines have no meaning; an empty statement is terminated with a
|
|
semi-colon.
|
|
Long statements can be continued with a backslash, \e\|.
|
|
A statement can be broken
|
|
without a backslash after a comma, left brace, &&, ||,
|
|
.BR do ,
|
|
.BR else ,
|
|
the right parenthesis of an
|
|
.BR if ,
|
|
.B while
|
|
or
|
|
.B for
|
|
statement, and the
|
|
right parenthesis of a function definition.
|
|
A comment starts with # and extends to, but does not include,
|
|
the end of line.
|
|
.PP
|
|
The following statements control program flow inside blocks.
|
|
.RS 5
|
|
.PP
|
|
.B if
|
|
( \*(ex )
|
|
.I statement
|
|
.PP
|
|
.B if
|
|
( \*(ex )
|
|
.I statement
|
|
.B else
|
|
.I statement
|
|
.PP
|
|
.B while
|
|
( \*(ex )
|
|
.I statement
|
|
.PP
|
|
.B do
|
|
.I statement
|
|
.B while
|
|
( \*(ex )
|
|
.PP
|
|
.B for
|
|
(
|
|
\fIopt_expr\fR ;
|
|
\fIopt_expr\fR ;
|
|
\fIopt_expr\fR
|
|
)
|
|
.I statement
|
|
.PP
|
|
.B for
|
|
( \fIvar \fBin \fIarray\fR )
|
|
.I statement
|
|
.PP
|
|
.B continue
|
|
.PP
|
|
.B break
|
|
.RE
|
|
.\"
|
|
.SS "\fB2. Data types, conversion and comparison"
|
|
There are two basic data types, numeric and string.
|
|
Numeric constants can be integer like \-2,
|
|
decimal like 1.08, or in scientific notation like \-1.1e4 or .28E\-3.
|
|
All numbers are represented internally and all
|
|
computations are done in floating point arithmetic.
|
|
So for example, the expression
|
|
0.2e2 == 20
|
|
is true and true is represented as 1.0.
|
|
.PP
|
|
String constants are enclosed in double quotes.
|
|
.sp
|
|
.ce
|
|
"This is a string with a newline at the end.\en"
|
|
.sp
|
|
Strings can be continued across a line by escaping (\e) the newline.
|
|
The following escape sequences are recognized.
|
|
.nf
|
|
.sp
|
|
\e\e \e
|
|
\e" "
|
|
\ea alert, ascii 7
|
|
\eb backspace, ascii 8
|
|
\et tab, ascii 9
|
|
\en newline, ascii 10
|
|
\ev vertical tab, ascii 11
|
|
\ef formfeed, ascii 12
|
|
\er carriage return, ascii 13
|
|
\eddd 1, 2 or 3 octal digits for ascii ddd
|
|
\exhh 1 or 2 hex digits for ascii hh
|
|
.sp
|
|
.fi
|
|
If you escape any other character \ec, you get \ec, i.e.,
|
|
\fB\*n\fP
|
|
ignores the escape.
|
|
.PP
|
|
There are really three basic data types; the third is
|
|
.I "number and string"
|
|
which has both a numeric value and a string value
|
|
at the same time.
|
|
User defined variables come into existence when first referenced
|
|
and are initialized to
|
|
.IR null ,
|
|
a number and string value which has numeric value 0 and string value
|
|
"".
|
|
Non-trivial number and string typed data come from input
|
|
and are typically stored in fields.
|
|
(See section 4).
|
|
.PP
|
|
The type of an expression is determined by its context and automatic
|
|
type conversion occurs if needed.
|
|
For example, consider the statements
|
|
.nf
|
|
.sp
|
|
y = x + 2 ; z = x "hello"
|
|
.sp
|
|
.fi
|
|
The value stored in variable y will be typed numeric.
|
|
If x is not numeric,
|
|
the value read from x is converted to numeric before it is added to
|
|
2 and stored in y.
|
|
The value stored in variable z will be typed
|
|
string, and the value of x will be converted to string if necessary
|
|
and concatenated with "hello".
|
|
(Of course, the value and type stored in x is not changed by any conversions.)
|
|
A string expression is converted to numeric using its longest
|
|
numeric prefix as with
|
|
\fBatof\fP(3).
|
|
A numeric expression is converted to string by replacing
|
|
.I expr
|
|
with
|
|
.BR sprintf(CONVFMT ,
|
|
.IR expr ),
|
|
unless
|
|
.I expr
|
|
can be represented on the host machine as an exact integer, in which case
|
|
it is converted to \fBsprintf\fR("%d", \*(ex).
|
|
.B Sprintf()
|
|
is an AWK built-in that duplicates the functionality of
|
|
\fBsprintf\fP(3),
|
|
and
|
|
\fBCONVFMT\fP
|
|
is a built-in variable used for internal conversion
|
|
from number to string and initialized to "%.6g".
|
|
Explicit type conversions can be forced,
|
|
\*(ex ""
|
|
is string and
|
|
.IR expr +0
|
|
is numeric.
|
|
.PP
|
|
To evaluate,
|
|
\*(ex\d1\u \fBrel-op\fR \*(ex\d2\u,
|
|
if both operands are numeric or number and string then the comparison
|
|
is numeric; if both operands are string the comparison is string;
|
|
if one operand is string, the non-string operand is converted and
|
|
the comparison is string.
|
|
The result is numeric, 1 or 0.
|
|
.PP
|
|
In boolean contexts such as,
|
|
\fBif\fR ( \*(ex ) \fIstatement\fR,
|
|
a string expression evaluates true if and only if it is not the
|
|
empty string "";
|
|
numeric values if and only if not numerically zero.
|
|
.\"
|
|
.SS "\fB3. Regular expressions"
|
|
In the AWK language, records, fields and strings are often
|
|
tested for matching a
|
|
.IR "regular expression" .
|
|
Regular expressions are enclosed in slashes, and
|
|
.nf
|
|
.sp
|
|
\*(ex ~ /\fIr\fR/
|
|
.sp
|
|
.fi
|
|
is an AWK expression that evaluates to 1 if \*(ex \*(``matches\*(''
|
|
.IR r ,
|
|
which means a substring of \*(ex is in the set of strings
|
|
defined by
|
|
.IR r .
|
|
With no match the expression evaluates to 0; replacing
|
|
~ with the \*(``not match\*('' operator, !~ , reverses the meaning.
|
|
As pattern-action pairs,
|
|
.nf
|
|
.sp
|
|
/\fIr\fR/ { \fIaction\fR } and\
|
|
\fB$0\fR ~ /\fIr\fR/ { \fIaction\fR }
|
|
.sp
|
|
.fi
|
|
are the same,
|
|
and for each input record that matches
|
|
.IR r ,
|
|
.I action
|
|
is executed.
|
|
In fact, /\fIr\fR/ is an AWK expression that is
|
|
equivalent to (\fB$0\fR ~ /\fIr\fR/) anywhere except when on the
|
|
right side of a match operator or passed as an argument to
|
|
a built-in function that expects a regular expression
|
|
argument.
|
|
.PP
|
|
AWK uses extended regular expressions as with
|
|
the \fB\-E\fP option of \fBgrep\fP(1).
|
|
The regular expression metacharacters, i.e., those with special
|
|
meaning in regular expressions are
|
|
.nf
|
|
.sp
|
|
\e ^ $ . [ ] | ( ) * + ? { }
|
|
.sp
|
|
.fi
|
|
If the command line option \fI\-W traditional\fP is used, these are omitted:
|
|
.nf
|
|
.sp
|
|
{ }
|
|
.sp
|
|
.fi
|
|
Otherwise, { and } are regular expression metacharacters and
|
|
require escaping to be a literal character.
|
|
|
|
Regular expressions are built up from characters as follows:
|
|
.RS 5
|
|
.TP \w'[^c\d1\uc\d2\uc\d3\u...]'u+1n
|
|
\fIc\fR
|
|
matches any non-metacharacter
|
|
.IR c .
|
|
.TP
|
|
\e\fIc\fR
|
|
matches a character defined by the same
|
|
escape sequences used
|
|
in string constants or the literal
|
|
character \fIc\fR if \e\fIc\fR is not an escape sequence.
|
|
.TP
|
|
\&\.
|
|
matches any character (including newline).
|
|
.TP
|
|
^
|
|
matches the front of a string.
|
|
.TP
|
|
$
|
|
matches the back of a string.
|
|
.TP
|
|
[c\d1\uc\d2\uc\d3\u...]
|
|
matches any character in the class
|
|
c\d1\uc\d2\uc\d3\u...\ .
|
|
An interval of characters is denoted
|
|
c\d1\u\-c\d2\u inside a class [...].
|
|
.TP
|
|
[^c\d1\uc\d2\uc\d3\u...]
|
|
matches any character not in the class
|
|
c\d1\uc\d2\uc\d3\u...
|
|
.RE
|
|
.sp
|
|
Regular expressions are built up from other regular expressions
|
|
as follows:
|
|
.RS 5
|
|
.TP \w'[^c\d1\uc\d2\uc\d3\u...]'u+1n
|
|
\fIr\fR\d1\u\fIr\fR\d2\u
|
|
matches
|
|
\fIr\fR\d1\u
|
|
followed immediately by
|
|
\fIr\fR\d2\u
|
|
(\fIconcatenation\fR).
|
|
.sp
|
|
.TP
|
|
\fIr\fR\d1\u | \fIr\fR\d2\u
|
|
matches
|
|
\fIr\fR\d1\u or
|
|
\fIr\fR\d2\u
|
|
(\fIalternation\fR).
|
|
.sp
|
|
.TP
|
|
\fIr\fR*
|
|
matches \fIr\fR repeated zero or more times.
|
|
.TP
|
|
\fIr\fR+
|
|
matches \fIr\fR repeated one or more times.
|
|
.TP
|
|
\fIr\fR?
|
|
matches \fIr\fR zero or once.
|
|
(\fIrepetition\fR).
|
|
.TP
|
|
(\fIr\fR)
|
|
matches \fIr\fR
|
|
(\fIgrouping\fR).
|
|
.sp
|
|
.TP
|
|
\fIr\fR{n}
|
|
matches \fIr\fR exactly n times.
|
|
.TP
|
|
\fIr\fR{n,}
|
|
matches \fIr\fR repeated n or more times.
|
|
.TP
|
|
\fIr\fR{n,m}
|
|
matches \fIr\fR repeated n to m (inclusive) times.
|
|
.TP
|
|
\fIr\fR{,m}
|
|
matches \fIr\fR repeated 0 to m times (a non-standard option).
|
|
.RE
|
|
.PP
|
|
The increasing \fBprecedence of operators\fR is:
|
|
.nf
|
|
.sp
|
|
alternation concatenation repetition grouping
|
|
.sp
|
|
.fi
|
|
.PP
|
|
For example,
|
|
.nf
|
|
.sp
|
|
/^[_a\-zA\-Z][_a\-zA\-Z0\-9]*$/ and
|
|
/^[\-+]?([0\-9]+\e\|.?|\e\|.[0\-9])[0\-9]*([eE][\-+]?[0\-9]+)?$/
|
|
.sp
|
|
.fi
|
|
are matched by AWK identifiers and AWK numeric constants
|
|
respectively.
|
|
Note that \*(``.\*('' has to be escaped to be
|
|
recognized as a decimal point, and that metacharacters are not
|
|
special inside character classes.
|
|
.PP
|
|
Any expression can be used on the right hand side of the ~ or !~
|
|
operators or
|
|
passed to a built-in that expects
|
|
a regular expression.
|
|
If needed, it is converted to string, and then interpreted
|
|
as a regular expression.
|
|
For example,
|
|
.nf
|
|
.sp
|
|
BEGIN { identifier = "[_a\-zA\-Z][_a\-zA\-Z0\-9]*" }
|
|
|
|
$0 ~ "^" identifier
|
|
.sp
|
|
.fi
|
|
prints all lines that start with an AWK identifier.
|
|
.PP
|
|
\fB\*n\fP
|
|
recognizes the empty regular expression, //\|, which matches the
|
|
empty string and hence is matched by any string at the front,
|
|
back and between every character.
|
|
For example,
|
|
.nf
|
|
.sp
|
|
echo abc | \*n '{ gsub(//, "X") ; print }'
|
|
XaXbXcX
|
|
.sp
|
|
.fi
|
|
.\"
|
|
.SS "\fB4. Records and fields"
|
|
Records are read in one at a time, and stored in the
|
|
.I field
|
|
variable
|
|
.BR $0 .
|
|
The record is split into
|
|
.I fields
|
|
which are stored in
|
|
.BR $1 ,
|
|
.BR $2 ", ...,"
|
|
.BR $NF .
|
|
The built-in variable
|
|
.B NF
|
|
is set to the number of fields,
|
|
and
|
|
.B NR
|
|
and
|
|
.B FNR
|
|
are incremented by 1.
|
|
Fields above
|
|
.B $NF
|
|
are set to "".
|
|
.PP
|
|
Assignment to
|
|
.B $0
|
|
causes the fields and
|
|
.B NF
|
|
to be recomputed.
|
|
Assignment to
|
|
.B NF
|
|
or to a field
|
|
causes
|
|
.B $0
|
|
to be reconstructed by
|
|
concatenating the
|
|
.B $i's
|
|
separated by
|
|
.BR OFS .
|
|
Assignment to a field with index greater than
|
|
.BR NF ,
|
|
increases
|
|
.B NF
|
|
and causes
|
|
.B $0
|
|
to be reconstructed.
|
|
.PP
|
|
Data input stored in fields
|
|
is string, unless the entire field has numeric
|
|
form and then the type is number and string.
|
|
For example,
|
|
.sp
|
|
.nf
|
|
echo 24 24E |
|
|
\*n '{ print($1>100, $1>"100", $2>100, $2>"100") }'
|
|
0 1 1 1
|
|
.fi
|
|
.sp
|
|
.B $0
|
|
and
|
|
.B $2
|
|
are string and
|
|
.B $1
|
|
is number and string.
|
|
The first comparison is numeric, the second is string, the third is string
|
|
(100 is converted to "100"),
|
|
and the last is string.
|
|
.\"
|
|
.SS "\fB5. Expressions and operators"
|
|
The expression syntax is similar to C.
|
|
Primary expressions are numeric constants,
|
|
string constants, variables, fields, arrays and function calls.
|
|
The identifier
|
|
for a variable, array or function can be a sequence of
|
|
letters, digits and underscores, that does
|
|
not start with a digit.
|
|
Variables are not declared; they exist when first referenced and
|
|
are initialized to
|
|
.IR null .
|
|
.PP
|
|
New
|
|
expressions are composed with the following operators in
|
|
order of increasing precedence.
|
|
.PP
|
|
.RS 5
|
|
.nf
|
|
.vs +2p \" open up a little
|
|
\fIassignment\fR = += \-= *= /= %= ^=
|
|
\fIconditional\fR ? :
|
|
\fIlogical or\fR ||
|
|
\fIlogical and\fR &&
|
|
\fIarray membership\fR \fBin
|
|
\fImatching\fR ~ !~
|
|
\fIrelational\fR < > <= >= == !=
|
|
\fIconcatenation\fR (no explicit operator)
|
|
\fIadd ops\fR + \-
|
|
\fImul ops\fR * / %
|
|
\fIunary\fR + \-
|
|
\fIlogical not\fR !
|
|
\fIexponentiation\fR ^
|
|
\fIinc and dec\fR ++ \-\|\- (both post and pre)
|
|
\fIfield\fR $
|
|
.vs
|
|
.RE
|
|
.PP
|
|
.fi
|
|
Assignment, conditional and exponentiation associate right to
|
|
left; the other operators associate left to right.
|
|
Any expression can be parenthesized.
|
|
.\"
|
|
.SS "\fB6. Arrays"
|
|
.ds ae \fIarray\fR[\fIexpr\fR]
|
|
Awk provides one-dimensional arrays.
|
|
Array elements are expressed
|
|
as \*(ae.
|
|
.I Expr
|
|
is internally converted to string type, so, for example,
|
|
A[1] and A["1"] are the same element and the actual
|
|
index is "1".
|
|
Arrays indexed by strings are called associative arrays.
|
|
Initially an array is empty; elements exist when first accessed.
|
|
An expression,
|
|
\fIexpr\fB in\fI array\fR
|
|
evaluates to 1 if
|
|
\*(ae
|
|
exists, else to 0.
|
|
.PP
|
|
There is a form of the
|
|
.B for
|
|
statement that loops over each index of an array.
|
|
.nf
|
|
.sp
|
|
\fBfor\fR ( \fIvar\fB in \fIarray \fR) \fIstatement\fR
|
|
.sp
|
|
.fi
|
|
sets
|
|
.I var
|
|
to each index of
|
|
.I array
|
|
and executes
|
|
.IR statement .
|
|
The order that
|
|
.I var
|
|
traverses the indices of
|
|
.I array
|
|
is not defined.
|
|
.PP
|
|
The statement,
|
|
.B delete
|
|
\*(ae,
|
|
causes
|
|
\*(ae
|
|
not to exist.
|
|
\fB\*n\fP
|
|
supports the
|
|
.B delete
|
|
.I array
|
|
feature, which deletes all elements of
|
|
.IR array .
|
|
.PP
|
|
Multidimensional arrays are synthesized with concatenation using
|
|
the built-in variable
|
|
.BR SUBSEP .
|
|
\fIarray\fR[\fIexpr\fR\d1\u,\|\fIexpr\fR\d2\u]
|
|
is equivalent to
|
|
\fIarray\fR[\fIexpr\fR\d1\u \fBSUBSEP \fIexpr\fR\d2\u].
|
|
Testing for a multidimensional element uses a parenthesized index,
|
|
such as
|
|
.sp
|
|
.nf
|
|
if ( (i, j) in A ) print A[i, j]
|
|
.fi
|
|
.sp
|
|
.\"
|
|
.SS "\fB7. Built-in variables\fR"
|
|
The following variables are built-in and initialized before program
|
|
execution.
|
|
.RS 5
|
|
.TP
|
|
.B ARGC
|
|
number of command line arguments.
|
|
.TP
|
|
.B ARGV
|
|
array of command line arguments, 0..ARGC\-1.
|
|
.TP
|
|
.B CONVFMT
|
|
format for internal conversion of numbers to string,
|
|
initially = "%.6g".
|
|
.TP
|
|
.B ENVIRON
|
|
array indexed by environment variables.
|
|
An environment string,
|
|
\fIvar=value\fR is stored as
|
|
\fBENVIRON\fR[\fIvar\fR] =
|
|
.IR value .
|
|
.TP
|
|
.B FILENAME
|
|
name of the current input file.
|
|
.TP
|
|
.B FNR
|
|
current record number in
|
|
.BR FILENAME .
|
|
.TP
|
|
.B FS
|
|
splits records into fields as a regular expression.
|
|
.TP
|
|
.B NF
|
|
number of fields in the current record.
|
|
.TP
|
|
.B NR
|
|
current record number in the total input stream.
|
|
.TP
|
|
.B OFMT
|
|
format for printing numbers; initially = "%.6g".
|
|
.TP
|
|
.B OFS
|
|
inserted between fields on output, initially = " ".
|
|
.TP
|
|
.B ORS
|
|
terminates each record on output, initially = "\en".
|
|
.TP
|
|
.B RLENGTH
|
|
length set by the last call to the built-in function,
|
|
.BR match() .
|
|
.TP
|
|
.B RS
|
|
input record separator, initially = "\en".
|
|
.TP
|
|
.B RSTART
|
|
index set by the last call to
|
|
.BR match() .
|
|
.TP
|
|
.B SUBSEP
|
|
used to build multiple array subscripts, initially = "\e034".
|
|
.RE
|
|
.\"
|
|
.SS "\fB8. Built-in functions"
|
|
.B String functions
|
|
.RS 5
|
|
.TP
|
|
gsub(\fIr,s,t\fR) gsub(\fIr,s\fR)
|
|
Global substitution, every match of regular expression
|
|
.I r
|
|
in variable
|
|
.I t
|
|
is replaced by string
|
|
.IR s .
|
|
The number of replacements is returned.
|
|
If
|
|
.I t
|
|
is omitted,
|
|
.B $0
|
|
is used.
|
|
An
|
|
.I &
|
|
in the replacement string
|
|
.I s
|
|
is replaced by the matched substring of
|
|
.IR t .
|
|
\e& and \e\e put literal & and \e, respectively,
|
|
in the replacement string.
|
|
.TP
|
|
index(\fIs,t\fR)
|
|
If
|
|
.I t
|
|
is a substring of
|
|
.IR s ,
|
|
then the position where
|
|
.I t
|
|
starts is returned, else 0 is returned.
|
|
The first character of
|
|
.I s
|
|
is in position 1.
|
|
.TP
|
|
length(\fIs\fR)
|
|
Returns the length of string or array
|
|
.IR s .
|
|
.TP
|
|
match(\fIs,r\fR)
|
|
Returns the index of the first longest match of regular expression
|
|
.I r
|
|
in string
|
|
.IR s .
|
|
Returns 0 if no match.
|
|
As a side effect,
|
|
.B RSTART
|
|
is set to the return value.
|
|
.B RLENGTH
|
|
is set to the length of the match or \-1 if no match.
|
|
If the empty string is matched,
|
|
.B RLENGTH
|
|
is set to 0, and 1 is returned if the match is at the front, and
|
|
length(\fIs\fR)+1 is returned if the match is at the back.
|
|
.TP
|
|
split(\fIs,A,r\fR) split(\fIs,A\fR)
|
|
String
|
|
.I s
|
|
is split into fields by regular expression
|
|
.I r
|
|
and the fields are loaded into array
|
|
.IR A .
|
|
The number of fields
|
|
is returned.
|
|
See section 11 below for more detail.
|
|
If
|
|
.I r
|
|
is omitted,
|
|
.B FS
|
|
is used.
|
|
.TP
|
|
sprintf(\fIformat,expr-list\fR)
|
|
Returns a string constructed from
|
|
.I expr-list
|
|
according to
|
|
.IR format .
|
|
See the description of printf() below.
|
|
.TP
|
|
sub(\fIr,s,t\fR) sub(\fIr,s\fR)
|
|
Single substitution, same as gsub() except at most one substitution.
|
|
.TP
|
|
substr(\fIs,i,n\fR) substr(\fIs,i\fR)
|
|
Returns the substring of string
|
|
.IR s ,
|
|
starting at index
|
|
.IR i ,
|
|
of length
|
|
.IR n .
|
|
If
|
|
.I n
|
|
is omitted, the suffix of
|
|
.IR s ,
|
|
starting at
|
|
.I i
|
|
is returned.
|
|
.TP
|
|
tolower(\fIs\fR)
|
|
Returns a copy of
|
|
.I s
|
|
with all upper case characters converted to lower case.
|
|
.TP
|
|
toupper(\fIs\fR)
|
|
Returns a copy of
|
|
.I s
|
|
with all lower case characters converted to upper case.
|
|
.RE
|
|
.PP
|
|
.B Time functions
|
|
.PP
|
|
These are available on systems which support the corresponding C
|
|
\fBmktime\fP and \fBstrftime\fP functions:
|
|
.RS 5
|
|
.TP
|
|
mktime(\fIspecification\fR)
|
|
converts a date specification to a timestamp
|
|
with the same units as \fBsystime\fP.
|
|
The date specification is a string containing the components of the
|
|
date as decimal integers:
|
|
.RS
|
|
.TP 3
|
|
.B YYYY
|
|
the year, e.g., 2012
|
|
.TP 3
|
|
.B MM
|
|
the month of the year starting at 1
|
|
.TP 3
|
|
.B DD
|
|
the day of the month starting at 1
|
|
.TP 3
|
|
.B HH
|
|
hour (0-23)
|
|
.TP 3
|
|
.B MM
|
|
minute (0-59)
|
|
.TP 3
|
|
.B SS
|
|
seconds (0-59)
|
|
.TP 3
|
|
.B DST
|
|
tells how to treat timezone versus daylight saving time:
|
|
.RS 5
|
|
.TP 3
|
|
positive
|
|
DST is in effect
|
|
.TP 3
|
|
zero (default)
|
|
DST is not in effect
|
|
.TP 3
|
|
negative
|
|
mktime()
|
|
should (use timezone information and system databases to) attempt to
|
|
determine whether DST is in effect at the specified time.
|
|
.RE
|
|
.RE
|
|
.TP
|
|
strftime([\fIformat\fR [, \fItimestamp\fP [, \fIutc\fP ]]])
|
|
formats the given timestamp using the format (passed to the C \fBstrftime\fP
|
|
function):
|
|
.RS
|
|
.bP
|
|
If the \fIformat\fP parameter is missing, "%c" is used.
|
|
.bP
|
|
If the \fItimestamp\fP parameter is missing, the current value from
|
|
\fBsystime\fP is used.
|
|
.bP
|
|
If the \fIutc\fP parameter is present and nonzero,
|
|
the result is in UTC.
|
|
Otherwise local time is used.
|
|
.RE
|
|
.TP
|
|
systime()
|
|
returns the current time of day as the number of seconds
|
|
since the Epoch (1970-01-01 00:00:00 UTC on POSIX systems).
|
|
.RE
|
|
.PP
|
|
.B Arithmetic functions
|
|
.RS 5
|
|
.ie n .ds Pi pi
|
|
.el .ds Pi \\(*p
|
|
.TP
|
|
atan2(\fIy,x\fR)
|
|
Arctan of \fIy\fR/\fIx\fR between \-\*(Pi and \*(Pi.
|
|
.TP
|
|
cos(\fIx\fR)
|
|
Cosine function, \fIx\fR in radians.
|
|
.TP
|
|
exp(\fIx\fR)
|
|
Exponential function.
|
|
.TP
|
|
int(\fIx\fR)
|
|
Returns \fIx\fR truncated towards zero.
|
|
.TP
|
|
log(\fIx\fR)
|
|
Natural logarithm.
|
|
.TP
|
|
rand()
|
|
Returns a random number between zero and one.
|
|
.TP
|
|
sin(\fIx\fR)
|
|
Sine function, \fIx\fR in radians.
|
|
.TP
|
|
sqrt(\fIx\fR)
|
|
Returns square root of \fIx\fR.
|
|
.TP
|
|
srand(\fIexpr\fR)
|
|
.TP
|
|
srand()
|
|
Seeds the random number generator,
|
|
using the clock if \fIexpr\fP is omitted,
|
|
and returns the value of the previous seed.
|
|
Srand(\fIexpr\fR) is useful for repeating pseudo-random sequences.
|
|
.IP
|
|
Note:
|
|
\fB\*n\fP
|
|
is normally configured to seed the random number generator from the clock
|
|
at startup, making it unnecessary to call srand().
|
|
This feature can be suppressed via conditional compile,
|
|
or overridden using the \fB\-Wrandom\fP option.
|
|
.RE
|
|
.\"
|
|
.SS "\fB9. Input and output"
|
|
There are two output statements,
|
|
.B print
|
|
and
|
|
.BR printf .
|
|
.RS 5
|
|
.TP
|
|
print
|
|
writes
|
|
.B "$0 ORS"
|
|
to standard output.
|
|
.TP
|
|
print \*(ex\d1\u, \*(ex\d2\u, ..., \*(ex\dn\u
|
|
writes
|
|
\*(ex\d1\u \fBOFS \*(ex\d2\u \fBOFS\fR ... \*(ex\dn\u
|
|
.B ORS
|
|
to standard output.
|
|
Numeric expressions are converted to string with
|
|
.BR OFMT .
|
|
.TP
|
|
printf \fIformat, expr-list\fR
|
|
duplicates the printf C library function writing to standard output.
|
|
The complete ANSI C format specifications are recognized with
|
|
conversions %c, %d, %e, %E, %f, %g, %G,
|
|
%i, %o, %s, %u, %x, %X and %%,
|
|
and conversion qualifiers h and l.
|
|
.RE
|
|
.PP
|
|
The argument list to print or printf can optionally be enclosed in
|
|
parentheses.
|
|
Print formats numbers using
|
|
.B OFMT
|
|
or "%d" for exact integers.
|
|
"%c" with a numeric argument prints the corresponding 8 bit
|
|
character, with a string argument it prints the first character of
|
|
the string.
|
|
The output of print and printf can be redirected to a file or
|
|
command by appending >
|
|
.IR file ,
|
|
>>
|
|
.I file
|
|
or
|
|
|
|
|
.I command
|
|
to the end of the print statement.
|
|
Redirection opens
|
|
.I file
|
|
or
|
|
.I command
|
|
only once; subsequent redirections append to the already open stream.
|
|
By convention,
|
|
\fB\*n\fP
|
|
associates the filename
|
|
.RS 3
|
|
.bP
|
|
"/dev/stderr" with stderr,
|
|
.bP
|
|
"/dev/stdout" with stdout,
|
|
.bP
|
|
"\-" and "/dev/stdin" with stdin.
|
|
.RE
|
|
.PP
|
|
The association with stderr is especially useful because it allows
|
|
print and printf to be redirected to stderr.
|
|
These names can also be passed to functions.
|
|
.PP
|
|
The input function
|
|
.B getline
|
|
has the following variations.
|
|
.RS 5
|
|
.TP
|
|
getline
|
|
reads into
|
|
.BR $0 ,
|
|
updates the fields,
|
|
.BR NF ,
|
|
.B NR
|
|
and
|
|
.BR FNR .
|
|
.TP
|
|
getline < \fIfile\fR
|
|
reads into
|
|
.B $0
|
|
from \fIfile\fR,
|
|
updates the fields and
|
|
.BR NF .
|
|
.TP
|
|
getline \fIvar\fR
|
|
reads the next record into
|
|
.IR var ,
|
|
updates
|
|
.B NR
|
|
and
|
|
.BR FNR .
|
|
.TP
|
|
getline \fIvar\fR < \fIfile\fR
|
|
reads the next record of
|
|
.I file
|
|
into
|
|
.IR var .
|
|
.TP
|
|
\fIcommand\fR | getline
|
|
pipes a record from
|
|
.I command
|
|
into
|
|
.B $0
|
|
and updates the fields and
|
|
.BR NF .
|
|
.TP
|
|
\fIcommand\fR | getline \fIvar\fR
|
|
pipes a record from
|
|
.I command
|
|
into
|
|
.IR var .
|
|
.RE
|
|
.PP
|
|
Getline returns 0 on end-of-file, \-1 on error, otherwise 1.
|
|
.PP
|
|
Commands on the end of pipes are executed by /bin/sh.
|
|
.PP
|
|
The function \fBclose\fR(\*(ex) closes the file or pipe
|
|
associated with
|
|
.IR expr .
|
|
Close returns 0 if
|
|
.I expr
|
|
is an open file,
|
|
the exit status if
|
|
.I expr
|
|
is a piped command, and \-1 otherwise.
|
|
Close is used to reread a file or command, make sure the other
|
|
end of an output pipe is finished or conserve file resources.
|
|
.PP
|
|
The function \fBfflush\fR(\*(ex) flushes the output file or pipe
|
|
associated with
|
|
.IR expr .
|
|
Fflush returns 0 if
|
|
.I expr
|
|
is an open output stream else \-1.
|
|
Fflush without an argument flushes stdout.
|
|
Fflush with an empty argument ("") flushes all open output.
|
|
.PP
|
|
The function
|
|
\fBsystem\fR(\fIexpr\fR)
|
|
uses the C runtime \fBsystem\fP call to execute
|
|
.I expr
|
|
and returns the corresponding wait status of the command as follows:
|
|
.bP
|
|
if the \fBsystem\fP call failed, setting the status to \-1,
|
|
\fB\*n\fP returns that value.
|
|
.bP
|
|
if the command exited normally,
|
|
\fB\*n\fP returns its exit-status.
|
|
.bP
|
|
if the command exited due to a signal such as \fBSIGHUP\fP,
|
|
\fB\*n\fP returns the signal number plus 256.
|
|
.PP
|
|
Changes made to the
|
|
.B ENVIRON
|
|
array are not passed to commands executed with
|
|
.B system
|
|
or pipes.
|
|
.SS "\fB10. User defined functions"
|
|
The syntax for a user defined function is
|
|
.nf
|
|
.sp
|
|
\fBfunction\fR name( \fIargs\fR ) { \fIstatements\fR }
|
|
.sp
|
|
.fi
|
|
The function body can contain a return statement
|
|
.nf
|
|
.sp
|
|
\fBreturn\fI opt_expr\fR
|
|
.sp
|
|
.fi
|
|
A return statement is not required.
|
|
Function calls may be nested or recursive.
|
|
Functions are passed expressions by value
|
|
and arrays by reference.
|
|
Extra arguments serve as local variables
|
|
and are initialized to
|
|
.IR null .
|
|
For example, csplit(\fIs,\|A\fR) puts each character of
|
|
.I s
|
|
into array
|
|
.I A
|
|
and returns the length of
|
|
.IR s .
|
|
.nf
|
|
.sp
|
|
function csplit(s, A, n, i)
|
|
{
|
|
n = length(s)
|
|
for( i = 1 ; i <= n ; i++ ) A[i] = substr(s, i, 1)
|
|
return n
|
|
}
|
|
.sp
|
|
.fi
|
|
Putting extra space between passed arguments and local
|
|
variables is conventional.
|
|
Functions can be referenced before they are defined, but the
|
|
function name and the '(' of the arguments must touch to
|
|
avoid confusion with concatenation.
|
|
.sp
|
|
A function parameter is normally a scalar value (number or string).
|
|
If there is a forward reference to a function using an array as a parameter,
|
|
the function's corresponding parameter will be treated as an array.
|
|
.\"
|
|
.SS "\fB11. Splitting strings, records and files"
|
|
Awk programs use the same algorithm to
|
|
split strings into arrays with split(), and records into fields
|
|
on
|
|
.BR FS .
|
|
\fB\*n\fP
|
|
uses essentially the same algorithm to split files into
|
|
records on
|
|
.BR RS .
|
|
.PP
|
|
Split(\fIexpr,\|A,\|sep\fR) works as follows:
|
|
.RS 3
|
|
.TP 5
|
|
(1)
|
|
If
|
|
.I sep
|
|
is omitted, it is replaced by
|
|
.BR FS .
|
|
.I Sep
|
|
can be an expression or regular expression.
|
|
If it is an expression of non-string type, it is converted to string.
|
|
.TP
|
|
(2)
|
|
If
|
|
.I sep
|
|
= " " (a single space),
|
|
then <SPACE> is trimmed from the front and back of
|
|
.IR expr ,
|
|
and
|
|
.I sep
|
|
becomes <SPACE>.
|
|
\fB\*n\fP
|
|
defines <SPACE> as the regular expression
|
|
/[\ \et\en]+/.
|
|
Otherwise
|
|
.I sep
|
|
is treated as a regular expression, except that meta-characters
|
|
are ignored for a string of length 1,
|
|
e.g.,
|
|
split(x, A, "*") and split(x, A, /\e*/) are the same.
|
|
.TP
|
|
(3)
|
|
If \*(ex is not string, it is converted to string.
|
|
If \*(ex is then the empty string "", split() returns 0
|
|
and
|
|
.I A
|
|
is set empty.
|
|
Otherwise,
|
|
all non-overlapping, non-null and longest matches of
|
|
.I sep
|
|
in
|
|
.IR expr ,
|
|
separate
|
|
.I expr
|
|
into fields which are loaded into
|
|
.IR A .
|
|
The fields are placed in
|
|
A[1], A[2], ..., A[n] and split() returns n, the number
|
|
of fields which is the number
|
|
of matches plus one.
|
|
Data placed in
|
|
.I A
|
|
that looks numeric is typed number and string.
|
|
.RE
|
|
.PP
|
|
Splitting records into fields works the same except the
|
|
pieces are loaded into
|
|
.BR $1 ,
|
|
\fB$2\fR,...,
|
|
.BR $NF .
|
|
If
|
|
.B $0
|
|
is empty,
|
|
.B NF
|
|
is set to 0 and all
|
|
.B $i
|
|
to "".
|
|
.PP
|
|
\fB\*n\fP
|
|
splits files into records by the same algorithm, but with the
|
|
slight difference that
|
|
.B RS
|
|
is really a terminator instead of a separator.
|
|
(\fBORS\fR is really a terminator too).
|
|
.RS 5
|
|
.PP
|
|
E.g., if
|
|
.B FS
|
|
= \*(``:+\*('' and
|
|
.B $0
|
|
= \*(``a::b:\*('' , then
|
|
.B NF
|
|
= 3 and
|
|
.B $1
|
|
= \*(``a\*('',
|
|
.B $2
|
|
= \*(``b\*('' and
|
|
.B $3
|
|
= "", but
|
|
if \*(``a::b:\*('' is the contents of an input file and
|
|
.B RS
|
|
= \*(``:+\*('', then
|
|
there are two records \*(``a\*('' and \*(``b\*(''.
|
|
.RE
|
|
.PP
|
|
.B RS
|
|
= " " is not special.
|
|
.PP
|
|
If
|
|
.B FS
|
|
= "", then
|
|
\fB\*n\fP
|
|
breaks the record into individual characters, and, similarly,
|
|
split(\fIs,A,\fR"") places the individual characters of
|
|
.I s
|
|
into
|
|
.IR A .
|
|
.\"
|
|
.SS "\fB12. Multi-line records"
|
|
Since
|
|
\fB\*n\fP
|
|
interprets
|
|
.B RS
|
|
as a regular expression, multi-line
|
|
records are easy.
|
|
Setting
|
|
.B RS
|
|
= "\en\en+", makes one or more blank
|
|
lines separate records.
|
|
If
|
|
.B FS
|
|
= " " (the default), then single
|
|
newlines, by the rules for <SPACE> above, become space and
|
|
single newlines are field separators.
|
|
.RS 5
|
|
.PP
|
|
For example, if
|
|
.bP
|
|
a file is "a\ b\enc\en\en",
|
|
.bP
|
|
\fBRS\fP = "\en\en+" and
|
|
.bP
|
|
\fBFS\fP = "\ ",
|
|
.PP
|
|
then there is one record \*(``a\ b\enc\*('' with three
|
|
fields \*(``a\*('', \*(``b\*('' and \*(``c\*('':
|
|
.bP
|
|
using
|
|
.B FS
|
|
= \*(``\en\*('', gives two
|
|
fields \*(``a b\*('' and \*(``c\*('';
|
|
.bP
|
|
using
|
|
.B FS
|
|
= \*(``\*('', gives one field
|
|
identical to the record.
|
|
.RE
|
|
.PP
|
|
If you want lines with spaces or tabs to be considered blank,
|
|
set
|
|
.B RS
|
|
= \*(``\en([\ \et]*\en)+\*(''.
|
|
For compatibility with other awks, setting
|
|
.B RS
|
|
= "" has the same
|
|
effect as if blank lines are stripped from the
|
|
front and back of files and then records are determined as if
|
|
.B RS
|
|
= \*(``\en\en+\*(''.
|
|
POSIX requires that \*(``\en\*('' always separates records when
|
|
.B RS
|
|
= "" regardless of the value of
|
|
.BR FS .
|
|
\fB\*n\fP
|
|
does not support this convention, because defining
|
|
\*(``\en\*('' as <SPACE> makes it unnecessary.
|
|
.\"
|
|
.PP
|
|
Most of the time when you change
|
|
.B RS
|
|
for multi-line records, you
|
|
will also want to change
|
|
.B ORS
|
|
to \*(``\en\en\*('' so the record spacing is preserved on output.
|
|
.\"
|
|
.SS "\fB13. Program execution"
|
|
This section describes the order of program execution.
|
|
First
|
|
.B ARGC
|
|
is set to the total number of command line arguments passed to
|
|
the execution phase of the program.
|
|
.bP
|
|
.B ARGV[0]
|
|
is set to the name of the AWK interpreter and
|
|
.bP
|
|
\fBARGV[1]\fR ...
|
|
.B ARGV[ARGC\-1]
|
|
holds the remaining command line arguments exclusive of
|
|
options and program source.
|
|
.PP
|
|
For example, with
|
|
.nf
|
|
.sp
|
|
\*n \-f prog v=1 A t=hello B
|
|
.sp
|
|
.fi
|
|
.B ARGC
|
|
= 5 with
|
|
.RS
|
|
.B ARGV[0]
|
|
= "\*n",
|
|
.br
|
|
.B ARGV[1]
|
|
= "v=1",
|
|
.br
|
|
.B ARGV[2]
|
|
= "A",
|
|
.br
|
|
.B ARGV[3]
|
|
= "t=hello" and
|
|
.br
|
|
.B ARGV[4]
|
|
= "B".
|
|
.RE
|
|
.PP
|
|
Next, each
|
|
.B BEGIN
|
|
block is executed in order.
|
|
If the program consists
|
|
entirely of
|
|
.B BEGIN
|
|
blocks, then execution terminates, else
|
|
an input stream is opened and execution continues.
|
|
If
|
|
.B ARGC
|
|
equals 1,
|
|
the input stream is set to stdin,
|
|
else the command line arguments
|
|
.BR ARGV[1] " ..."
|
|
.B ARGV[ARGC\-1]
|
|
are examined for a file argument.
|
|
.PP
|
|
The command line arguments divide into three sets:
|
|
file arguments, assignment arguments and empty strings "".
|
|
An assignment has the form
|
|
\fIvar\fR=\fIstring\fR.
|
|
When an
|
|
.B ARGV[i]
|
|
is examined as a possible file argument,
|
|
if it is empty it is skipped;
|
|
if it is an assignment argument, the assignment to
|
|
.I var
|
|
takes place and
|
|
.B i
|
|
skips to the next argument;
|
|
else
|
|
.B ARGV[i]
|
|
is opened for input.
|
|
If it fails to open, execution terminates with exit code 2.
|
|
If no command line argument is a file argument, then input
|
|
comes from stdin.
|
|
Getline in a
|
|
.B BEGIN
|
|
action opens input.
|
|
\*(``\-\*('' as a file argument denotes stdin.
|
|
.PP
|
|
Once an input stream is open, each input record is tested
|
|
against each
|
|
.IR pattern ,
|
|
and if it matches, the associated
|
|
.I action
|
|
is executed.
|
|
An expression pattern matches if it is boolean true (see
|
|
the end of section 2).
|
|
A
|
|
.B BEGIN
|
|
pattern matches before any input has been read, and
|
|
an
|
|
.B END
|
|
pattern matches after all input has been read.
|
|
A range pattern,
|
|
\fIexpr\fR1,\|\fIexpr\fR2 ,
|
|
matches every record between the match of
|
|
.IR expr 1
|
|
and the match of
|
|
.IR expr 2
|
|
inclusively.
|
|
.PP
|
|
When end of file occurs on the input stream, the remaining
|
|
command line arguments are examined for a file argument, and
|
|
if there is one it is opened, else the
|
|
.B END
|
|
.I pattern
|
|
is considered matched
|
|
and all
|
|
.B END
|
|
.I actions
|
|
are executed.
|
|
.PP
|
|
In the example, the assignment
|
|
v=1
|
|
takes place after the
|
|
.B BEGIN
|
|
.I actions
|
|
are executed, and
|
|
the data placed in
|
|
v
|
|
is typed number and string.
|
|
Input is then read from file A.
|
|
On end of file A,
|
|
t
|
|
is set to the string "hello",
|
|
and B is opened for input.
|
|
On end of file B, the
|
|
.B END
|
|
.I actions
|
|
are executed.
|
|
.PP
|
|
Program flow at the
|
|
.I pattern
|
|
.I {action}
|
|
level can be changed with the
|
|
.nf
|
|
.sp
|
|
\fBnext
|
|
\fBnextfile
|
|
\fBexit \fIopt_expr\fR
|
|
.sp
|
|
.fi
|
|
statements:
|
|
.bP
|
|
A
|
|
.B next
|
|
statement
|
|
causes the next input record to be read and pattern testing
|
|
to restart with the first
|
|
.I "pattern {action}"
|
|
pair in the program.
|
|
.bP
|
|
A
|
|
.B nextfile
|
|
statement tells \fB\*n\fP to stop processing the current input file.
|
|
It then updates FILENAME to the next file listed on the command line,
|
|
and resets FNR to 1.
|
|
.bP
|
|
An
|
|
.B exit
|
|
statement
|
|
causes immediate execution of the
|
|
.B END
|
|
actions or program termination if there are none or
|
|
if the
|
|
.B exit
|
|
occurs in an
|
|
.B END
|
|
action.
|
|
The
|
|
.I opt_expr
|
|
sets the exit value of the program unless overridden by
|
|
a later
|
|
.B exit
|
|
or subsequent error.
|
|
.\" **************************************************************************
|
|
.SH ENVIRONMENT
|
|
\fB\*N\fP recognizes these variables:
|
|
.RS 3
|
|
.TP 3
|
|
MAWKBINMODE
|
|
(see \fBCOMPATIBILITY\fP)
|
|
.TP
|
|
MAWK_LONG_OPTIONS
|
|
If this is set, \fB\*n\fP uses its value to decide what to do with
|
|
GNU-style long options:
|
|
.RS 5
|
|
.TP
|
|
allow
|
|
\fB\*N\fP allows the option to be checked against the (small) set of
|
|
long options it recognizes.
|
|
.IP
|
|
The long names from the \fB\-W\fP option are recognized,
|
|
e.g.,
|
|
\fB\-\-version\fP is derived from
|
|
\fB\-Wversion\fP.
|
|
.TP
|
|
error
|
|
\fB\*N\fP prints an error message and exits.
|
|
This is the default.
|
|
.TP
|
|
ignore
|
|
\fB\*N\fP ignores the option,
|
|
unless it happens to be one of the ones it recognizes.
|
|
.TP
|
|
warn
|
|
Print a warning message and otherwise ignore the option.
|
|
.RE
|
|
.IP
|
|
If the variable is unset, \fB\*n\fP prints an error message and exits.
|
|
.TP
|
|
WHINY_USERS
|
|
This is a \fBgawk\fP 3.1.0 feature, removed in the 4.0.0 release.
|
|
It tells \fB\*n\fP to sort array indices before it starts to iterate
|
|
over the elements of an array.
|
|
.RE
|
|
.\" **************************************************************************
|
|
.SH "COMPATIBILITY"
|
|
.SS "MAWK 1.3.3 versus POSIX 1003.2 Draft 11.3"
|
|
The POSIX 1003.2(draft 11.3) definition of the AWK language
|
|
is AWK as described in the AWK book with a few extensions
|
|
that appeared in SystemVR4 nawk.
|
|
The extensions are:
|
|
.RS 3
|
|
.bP
|
|
New functions: toupper() and tolower().
|
|
.bP
|
|
New variables: ENVIRON[\|] and CONVFMT.
|
|
.bP
|
|
ANSI C conversion specifications for printf() and sprintf().
|
|
.bP
|
|
New command options: \-v var=value, multiple \-f options and
|
|
implementation options as arguments to \-W.
|
|
.bP
|
|
For systems (MS-DOS or Windows) which provide a \fIsetmode\fP function,
|
|
an environment variable MAWKBINMODE and a built-in variable BINMODE.
|
|
The bits of the BINMODE value tell \fB\*n\fP how to modify the
|
|
\fBRS\fP and \fBORS\fP variables:
|
|
.RS
|
|
.TP 3
|
|
0
|
|
set standard input to binary mode,
|
|
and if BIT-2 is unset, set \fBRS\fP to "\\r\\n" (CR/LF) rather than "\\n" (LF).
|
|
.TP 3
|
|
1
|
|
set standard output to binary mode,
|
|
and if BIT-2 is unset, set \fBORS\fP to "\\r\\n" (CR/LF) rather than "\\n" (LF).
|
|
.TP 3
|
|
2
|
|
suppress the assignment to \fBRS\fP and \fBORS\fP of CR/LF,
|
|
making it possible to run scripts and generate output compatible
|
|
with Unix line-endings.
|
|
.RE
|
|
.RE
|
|
.sp
|
|
POSIX AWK is oriented to operate on files a line at
|
|
a time.
|
|
.B RS
|
|
can be changed from "\en" to another single character,
|
|
but it
|
|
is hard to find any use for this \(em there are no
|
|
examples in the AWK book.
|
|
By convention, \fBRS\fR = "", makes one or more blank lines
|
|
separate records, allowing multi-line records.
|
|
When \fBRS\fR = "", "\en" is always a field separator
|
|
regardless of the value in
|
|
.BR FS .
|
|
.PP
|
|
.BR \*n ,
|
|
on the other hand,
|
|
allows
|
|
.B RS
|
|
to be a regular expression.
|
|
When "\en" appears in records, it is treated as space, and
|
|
.B FS
|
|
always determines fields.
|
|
.PP
|
|
Removing the line-at-a-time paradigm can make some programs
|
|
simpler and can
|
|
often improve performance.
|
|
For example, redoing example 3 from above,
|
|
.nf
|
|
.sp
|
|
BEGIN { RS = "[^A\-Za\-z]+" }
|
|
|
|
{ word[ $0 ] = "" }
|
|
|
|
END { delete word[ "" ]
|
|
for( i in word ) cnt++
|
|
print cnt
|
|
}
|
|
.sp
|
|
.fi
|
|
counts the number of unique words by making each word a record.
|
|
On moderate size files,
|
|
\fB\*n\fP
|
|
executes twice as fast, because of the simplified inner loop.
|
|
.PP
|
|
The following program replaces each comment by a single space in
|
|
a C program file,
|
|
.nf
|
|
.sp
|
|
BEGIN {
|
|
RS = "/\|\e*([^*]\||\|\e*+[^/*])*\e*+/"
|
|
# comment is record separator
|
|
ORS = " "
|
|
getline hold
|
|
}
|
|
|
|
{ print hold ; hold = $0 }
|
|
|
|
END { printf "%s" , hold }
|
|
.sp
|
|
.fi
|
|
Buffering one record is needed to avoid terminating the last
|
|
record with a space.
|
|
.PP
|
|
With
|
|
.BR \*n ,
|
|
the following are all equivalent,
|
|
.nf
|
|
.sp
|
|
x ~ /a\e+b/ x ~ "a\e+b" x ~ "a\e\e+b"
|
|
.sp
|
|
.fi
|
|
The strings get scanned twice, once as string and once as
|
|
regular expression.
|
|
On the string scan,
|
|
\fB\*n\fP ignores the escape on non-escape characters while the AWK
|
|
book advocates
|
|
.I \ec
|
|
be recognized as
|
|
.I c
|
|
which necessitates the double escaping of meta-characters in
|
|
strings.
|
|
POSIX explicitly declines to define the behavior which passively
|
|
forces programs that must run under a variety of awks to use
|
|
the more portable, but less readable, double escape.
|
|
.PP
|
|
POSIX AWK does not recognize "/dev/std{in,out,err}".
|
|
Some systems provide an actual device for this,
|
|
allowing AWKs which do not implement the feature directly to support it.
|
|
.PP
|
|
POSIX AWK does not recognize \ex hex escape
|
|
sequences in strings.
|
|
Unlike ANSI C,
|
|
\fB\*n\fP limits the number of digits that follow \ex to two as the current
|
|
implementation only supports 8 bit characters.
|
|
.PP
|
|
POSIX explicitly leaves the behavior of
|
|
.B FS
|
|
= "" undefined, and mentions splitting the record into characters as
|
|
a possible interpretation, but currently this use is not portable
|
|
across implementations.
|
|
.PP
|
|
Some features were not part of the POSIX standard until long after
|
|
their introduction in \fB\*n\fP and other implementations.
|
|
These were published in IEEE 1003.1-2024
|
|
(The Open Group Base Specifications Issue 8):
|
|
.bP
|
|
The built-in
|
|
.B fflush
|
|
first appeared in a 1993 AT&T awk released to netlib.
|
|
It was approved for the POSIX standard in 2012.
|
|
.bP
|
|
The built-in
|
|
.B nextfile
|
|
first appeared in gawk in 1988,
|
|
was adopted by BWK in 1996,
|
|
and by mawk in 2012.
|
|
It was approved for the POSIX standard in 2012.
|
|
.bP
|
|
Aggregate deletion with
|
|
.B delete
|
|
.I array
|
|
was approved in 2018.
|
|
.SS "Random numbers"
|
|
POSIX does not prescribe a method for initializing random numbers at startup.
|
|
.PP
|
|
In practice, most implementations do nothing special,
|
|
which makes \fBsrand\fP and \fBrand\fP follow the C runtime library,
|
|
making the initial seed value 1.
|
|
Some implementations (Solaris XPG4 and Tru64)
|
|
return 0 from the first call to \fBsrand\fP,
|
|
although the results from \fBrand\fP behave as if the initial seed is 1.
|
|
Other implementations return 1.
|
|
.PP
|
|
While
|
|
\fB\*n\fP
|
|
can call \fBsrand\fP at startup with no parameter
|
|
(initializing random numbers from the clock),
|
|
this feature may be suppressed using conditional compilation.
|
|
.
|
|
.SS "Extensions added for compatibility with GAWK and BWK"
|
|
.BR Mktime ,
|
|
.BR strftime \ and
|
|
.B systime
|
|
are \fBgawk\fP extensions.
|
|
.PP
|
|
The "/dev/stdin" feature was added to \fB\*n\fP after 1.3.4,
|
|
for compatibility with \fBgawk\fP and BWK awk.
|
|
The corresponding "-" (alias for /dev/stdin) was present in \fB\*n\fR 1.3.3.
|
|
.PP
|
|
Interval expressions,
|
|
e.g., a range \fI{m,n}\fP in Extended Regular Expressions (EREs),
|
|
were not supported in awk (or even the original \*(``nawk\*(''):
|
|
.bP
|
|
Gawk provided this feature in 1991 (and later, in 1998,
|
|
options for turning it off,
|
|
for compatibility with \*(``traditional awk\*('').
|
|
.bP
|
|
Interval expressions
|
|
were introduced into \fIawk\fP regular expressions
|
|
in IEEE 1003.1-2001 (also known as Unix 03),
|
|
along with some internationalization features.
|
|
.bP
|
|
Apple modified its copy of the original awk in April 2006,
|
|
making this version of awk support interval expressions.
|
|
.IP
|
|
The updated source provides for compatibility with
|
|
older \*(``legacy\*('' versions using an environment variable,
|
|
making this \*(``Unix 2003\*('' feature
|
|
(perhaps meant as Unix 03)
|
|
the default.
|
|
.bP
|
|
NetBSD developers copied this change in January 2018,
|
|
omitting the compatibility option,
|
|
and then applied it to BWK awk.
|
|
.bP
|
|
The interval expression implementation in \fB\*n\fP
|
|
is based on changes proposed by James Parkinson in April 2016.
|
|
.PP
|
|
\fB\*N\fP also recognizes a few gawk-specific command line options
|
|
for script compatibility:
|
|
.RS 5
|
|
.sp
|
|
.hy 0
|
|
.na
|
|
\fB\-\-help\fP,
|
|
\fB\-\-posix\fP,
|
|
\fB\-r\fP,
|
|
\fB\-\-re\-interval\fP,
|
|
\fB\-\-traditional\fP,
|
|
\fB\-\-version\fP
|
|
.ad
|
|
.hy
|
|
.RE
|
|
.
|
|
.SS "Subtle Differences not in POSIX or the AWK Book"
|
|
Finally, here is how
|
|
\fB\*n\fP
|
|
handles exceptional cases not discussed in the
|
|
AWK book or the POSIX draft.
|
|
It is unsafe to assume
|
|
consistency across awks and safe to skip to
|
|
the next section.
|
|
.PP
|
|
.RS 3
|
|
.bP
|
|
substr(s, i, n) returns the characters of s in the intersection
|
|
of the closed interval [1, length(s)] and the half-open interval [i, i+n).
|
|
When this intersection is empty, the empty string is
|
|
returned; so substr("ABC", 1, 0) = "" and
|
|
substr("ABC", \-4, 6) = "A".
|
|
.bP
|
|
Every string, including the empty string, matches the empty string
|
|
at the
|
|
front so, s ~ // and s ~ "", are always 1 as is match(s, //) and
|
|
match(s, "").
|
|
The last two set
|
|
.B RLENGTH
|
|
to 0.
|
|
.bP
|
|
index(s, t) is always the same as match(s, t1) where t1 is the
|
|
same as t with metacharacters escaped.
|
|
Hence consistency
|
|
with match requires that
|
|
index(s, "") always returns 1.
|
|
Also the condition, index(s,t) != 0 if and only if t is a substring
|
|
of s, requires index("","") = 1.
|
|
.bP
|
|
If getline encounters end of file, getline var leaves var
|
|
unchanged.
|
|
Similarly, on entry to the
|
|
.B END
|
|
actions,
|
|
.BR $0 ,
|
|
the fields and
|
|
.B NF
|
|
have their value unaltered from the last record.
|
|
.RE
|
|
.\" **************************************************************************
|
|
.SH BUGS
|
|
\fB\*n\fP
|
|
implements \fBprintf()\fR and \fBsprintf()\fR using the C library functions,
|
|
printf and sprintf, so full ANSI compatibility requires an ANSI
|
|
C library.
|
|
In practice this means the h conversion qualifier may not be available.
|
|
.sp
|
|
Also \fB\*n\fP inherits any bugs or limitations of the library functions.
|
|
.PP
|
|
Implementors of the AWK language have shown a consistent lack
|
|
of imagination when naming their programs.
|
|
.\" **************************************************************************
|
|
.SH EXAMPLES
|
|
.nf
|
|
1. emulate cat.
|
|
|
|
{ print }
|
|
|
|
2. emulate wc.
|
|
|
|
{ chars += length($0) + 1 # add one for the \en
|
|
words += NF
|
|
}
|
|
|
|
END{ print NR, words, chars }
|
|
|
|
3. count the number of unique \*(``real words\*(''.
|
|
|
|
BEGIN { FS = "[^A\-Za\-z]+" }
|
|
|
|
{ for(i = 1 ; i <= NF ; i++) word[$i] = "" }
|
|
|
|
END { delete word[""]
|
|
for ( i in word ) cnt++
|
|
print cnt
|
|
}
|
|
|
|
.fi
|
|
4. sum the second field of
|
|
every record based on the first field.
|
|
.nf
|
|
|
|
$1 ~ /credit\||\|gain/ { sum += $2 }
|
|
$1 ~ /debit\||\|loss/ { sum \-= $2 }
|
|
|
|
END { print sum }
|
|
|
|
5. sort a file, comparing as string
|
|
|
|
{ line[NR] = $0 "" } # make sure of comparison type
|
|
# in case some lines look numeric
|
|
|
|
END { isort(line, NR)
|
|
for(i = 1 ; i <= NR ; i++) print line[i]
|
|
}
|
|
|
|
#insertion sort of A[1..n]
|
|
function isort( A, n, i, j, hold)
|
|
{
|
|
for( i = 2 ; i <= n ; i++)
|
|
{
|
|
hold = A[j = i]
|
|
while ( A[j\-1] > hold )
|
|
{ j\-\|\- ; A[j+1] = A[j] }
|
|
A[j] = hold
|
|
}
|
|
# sentinel A[0] = "" will be created if needed
|
|
}
|
|
|
|
.fi
|
|
.\" **************************************************************************
|
|
.SH AUTHORS
|
|
Mike Brennan (brennan@whidbey.com).
|
|
.br
|
|
Thomas E. Dickey <dickey@invisible-island.net>.
|
|
.\" **************************************************************************
|
|
.SH SEE ALSO
|
|
\fBgrep\fP(1)
|
|
.PP
|
|
Aho, Kernighan and Weinberger,
|
|
.IR "The AWK Programming Language" ,
|
|
Addison-Wesley Publishing, 1988, (the AWK book),
|
|
defines the language, opening with a tutorial
|
|
and advancing to many interesting programs that delve into
|
|
issues of software design and analysis relevant to programming
|
|
in any language.
|
|
.PP
|
|
.IR "The GAWK Manual" ,
|
|
The Free Software Foundation, 1991, is a tutorial
|
|
and language reference
|
|
that does not attempt the depth of the AWK book
|
|
and assumes the reader may be a novice programmer.
|
|
The section on AWK arrays is excellent.
|
|
It also discusses POSIX requirements for AWK.
|
|
.PP
|
|
\fBmawk-arrays\fP(7) discusses \fB\*n\fP's implementation of arrays.
|
|
.PP
|
|
\fBmawk-code\fP(7) gives more information on the \fB\-W\ dump\fP option.
|
|
.PP
|
|
\fIawk \(en pattern scanning and processing language\fP
|
|
.br
|
|
The Open Group Base Specifications Issue 8
|
|
.br
|
|
IEEE Std 1003.1-2024
|
|
.br
|
|
https://pubs.opengroup.org/onlinepubs/9799919799/utilities/awk.html
|