mirror of
https://https.git.savannah.gnu.org/git/coreutils.git
synced 2026-01-26 15:29:07 +00:00
wc: treat non breaking space as a word separator
* src/wc.c (iswnbspace): A new function to match characters in this class. (isnbspace): Likewise for single byte charsets. (main): Initialize posixly_correct from the environment, to allow disabling honoring NBSP in non C locales. (wc): Call is[w]nbspace() along with is[w]space. * bootstrap.conf: Ensure btowc is available. * tests/misc/wc-nbsp.sh: A new test. * tests/local.mk: Reference the new test. * NEWS: Mention the change in behavior.
This commit is contained in:
parent
2ab2f7a422
commit
a5202bd585
3
NEWS
3
NEWS
@ -67,6 +67,9 @@ GNU coreutils NEWS -*- outline -*-
|
||||
operator, so POSIX changed this to 'test -e FILE'. Scripts using it were
|
||||
already broken and non-portable; the -a unary operator was never documented.
|
||||
|
||||
wc now treats non breaking space characters as word delimiters
|
||||
unless the POSIXLY_CORRECT environment variable is set.
|
||||
|
||||
** New features
|
||||
|
||||
id now supports specifying multiple users.
|
||||
|
||||
@ -38,6 +38,7 @@ gnulib_modules="
|
||||
backup-rename
|
||||
base32
|
||||
base64
|
||||
btowc
|
||||
buffer-lcm
|
||||
c-strcase
|
||||
cl-strtod
|
||||
|
||||
25
src/wc.c
25
src/wc.c
@ -74,6 +74,9 @@ static bool have_read_stdin;
|
||||
/* Used to determine if file size can be determined without reading. */
|
||||
static size_t page_size;
|
||||
|
||||
/* If true, do not treat non breaking space as a word separator. */
|
||||
static bool posixly_correct;
|
||||
|
||||
/* The result of calling fstat or stat on a file descriptor or file. */
|
||||
struct fstatus
|
||||
{
|
||||
@ -147,6 +150,21 @@ the following order: newline, word, character, byte, maximum line length.\n\
|
||||
exit (status);
|
||||
}
|
||||
|
||||
/* Return non zero if a non breaking space. */
|
||||
static int _GL_ATTRIBUTE_PURE
|
||||
iswnbspace (wint_t wc)
|
||||
{
|
||||
return ! posixly_correct
|
||||
&& (wc == 0x00A0 || wc == 0x2007
|
||||
|| wc == 0x202F || wc == 0x2060);
|
||||
}
|
||||
|
||||
/* Single byte counterpart of iswnbspace: convert the byte C to a wide
   character with btowc, then report whether iswnbspace accepts it.  */
static int
isnbspace (int c)
{
  wint_t wide_c = btowc (c);
  return iswnbspace (wide_c);
}
|
||||
|
||||
/* FILE is the name of the file (or NULL for standard input)
|
||||
associated with the specified counters. */
|
||||
static void
|
||||
@ -455,7 +473,7 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
|
||||
if (width > 0)
|
||||
linepos += width;
|
||||
}
|
||||
if (iswspace (wide_char))
|
||||
if (iswspace (wide_char) || iswnbspace (wide_char))
|
||||
goto mb_word_separator;
|
||||
in_word = true;
|
||||
}
|
||||
@ -538,7 +556,8 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
|
||||
if (isprint (to_uchar (p[-1])))
|
||||
{
|
||||
linepos++;
|
||||
if (isspace (to_uchar (p[-1])))
|
||||
if (isspace (to_uchar (p[-1]))
|
||||
|| isnbspace (to_uchar (p[-1])))
|
||||
goto word_separator;
|
||||
in_word = true;
|
||||
}
|
||||
@ -681,6 +700,8 @@ main (int argc, char **argv)
|
||||
so that processes running in parallel do not intersperse their output. */
|
||||
setvbuf (stdout, NULL, _IOLBF, 0);
|
||||
|
||||
posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
|
||||
|
||||
print_lines = print_words = print_chars = print_bytes = false;
|
||||
print_linelength = false;
|
||||
total_lines = total_words = total_chars = total_bytes = max_line_length = 0;
|
||||
|
||||
@ -273,6 +273,7 @@ all_tests = \
|
||||
tests/misc/wc.pl \
|
||||
tests/misc/wc-files0-from.pl \
|
||||
tests/misc/wc-files0.sh \
|
||||
tests/misc/wc-nbsp.sh \
|
||||
tests/misc/wc-parallel.sh \
|
||||
tests/misc/wc-proc.sh \
|
||||
tests/misc/cat-proc.sh \
|
||||
|
||||
42
tests/misc/wc-nbsp.sh
Executable file
42
tests/misc/wc-nbsp.sh
Executable file
@ -0,0 +1,42 @@
|
||||
#!/bin/sh
|
||||
# Test non breaking space handling by 'wc -w'.
# Each locale section below runs only when 'locale charmap' reports the
# expected charset; otherwise its checks are skipped without failing.
|
||||
|
||||
# Copyright (C) 2019 Free Software Foundation, Inc.
|
||||
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
|
||||
print_ver_ wc printf
|
||||
|
||||
# Before coreutils 8.31 nbsp was treated as part of a word,
|
||||
# rather than a word delimiter
|
||||
|
||||
# Single byte charset: byte 0xA0 between two '=' must yield two words,
# but only one word when POSIXLY_CORRECT is set in wc's environment.
export LC_ALL=en_US.ISO-8859-1
|
||||
if test "$(locale charmap 2>/dev/null)" = ISO-8859-1; then
|
||||
test $(env printf '=\xA0=' | wc -w) = 2 || fail=1
|
||||
test $(env printf '=\xA0=' | POSIXLY_CORRECT=1 wc -w) = 1 || fail=1
|
||||
fi
|
||||
# Multi byte charset: each of U+00A0, U+2007, U+202F and U+2060
# must split the input into two words.
export LC_ALL=en_US.UTF-8
|
||||
if test "$(locale charmap 2>/dev/null)" = UTF-8; then
|
||||
test $(env printf '=\u00A0=' | wc -w) = 2 || fail=1
|
||||
test $(env printf '=\u2007=' | wc -w) = 2 || fail=1
|
||||
test $(env printf '=\u202F=' | wc -w) = 2 || fail=1
|
||||
test $(env printf '=\u2060=' | wc -w) = 2 || fail=1
|
||||
fi
|
||||
# Another single byte charset, where the byte under test is 0x9A
# (presumably the KOI8-R encoding of the non breaking space).
export LC_ALL=ru_RU.KOI8-R
|
||||
if test "$(locale charmap 2>/dev/null)" = KOI8-R; then
|
||||
test $(env printf '=\x9A=' | wc -w) = 2 || fail=1
|
||||
fi
|
||||
|
||||
# Exit with the accumulated status: fail=1 if any check above failed.
Exit $fail
|
||||
Loading…
x
Reference in New Issue
Block a user