cksum: use ARMv8 SIMD extensions

* configure.ac: Add check for ARMv8 VMULL support.
* src/cksum.c: Add ARMv8 VMULL detection function.
* src/cksum.h: Add ARMv8 VMULL implementation declaration.
* src/cksum_vmull.c: ARMv8 VMULL implementation.
* src/local.mk: Add build flags for ARMv8 VMULL.
* NEWS: Mention the ARMv8 SIMD improvement.
This commit is contained in:
Sam Russell 2024-11-28 20:28:21 +01:00 committed by Pádraig Brady
parent fd01fc8075
commit d155be4a22
6 changed files with 307 additions and 2 deletions

4
NEWS
View File

@ -70,8 +70,8 @@ GNU coreutils NEWS -*- outline -*-
** Improvements
cksum -a crc, makes use of AVX2 and AVX512 extensions for time reductions
of 40% and 60% respectively.
cksum -a crc, makes use of AVX2, AVX512, and ARMv8 SIMD extensions
for time reductions of up to 40%, 60%, and 80% respectively.
'head -c NUM', 'head -n NUM', 'nl -l NUM', 'nproc --ignore NUM',
'tail -c NUM', 'tail -n NUM', and 'tail --max-unchanged-stats NUM

View File

@ -618,6 +618,40 @@ if test $utils_cv_brain_16_bit_supported = yes; then
AC_DEFINE([BF16_SUPPORTED], [1], [Brain 16 bit float supported])
fi
ac_save_CFLAGS=$CFLAGS
CFLAGS="-march=armv8-a+crypto $CFLAGS"
AC_MSG_CHECKING([if vmull intrinsic exists])
AC_CACHE_VAL([utils_cv_vmull_intrinsic_exists],[
AC_LINK_IFELSE(
[AC_LANG_SOURCE([[
#include <stdio.h>
#include <sys/auxv.h>
#include <asm/hwcap.h>
#include <arm_neon.h>
int
main (void)
{
uint64x2_t a;
poly64_t shift64 = vget_lane_p64(vcreate_p64(0xB8BC6765), 0);
a = vreinterpretq_u64_p128(vmull_p64(shift64, vreinterpretq_p128_u64(a)));
return (getauxval(AT_HWCAP) & HWCAP_PMULL) > 0;
}
]])
],[
utils_cv_vmull_intrinsic_exists=yes
],[
utils_cv_vmull_intrinsic_exists=no
])])
AC_MSG_RESULT([$utils_cv_vmull_intrinsic_exists])
if test $utils_cv_vmull_intrinsic_exists = yes; then
AC_DEFINE([USE_VMULL_CRC32], [1],
[CRC32 calculation by vmull hardware instruction enabled])
fi
AM_CONDITIONAL([USE_VMULL_CRC32],
[test $utils_cv_vmull_intrinsic_exists = yes])
CFLAGS=$ac_save_CFLAGS
ac_save_CFLAGS=$CFLAGS
CFLAGS="-mavx -mpclmul $CFLAGS"
AC_MSG_CHECKING([if pclmul intrinsic exists])

View File

@ -40,6 +40,11 @@
#include <endian.h>
#include "system.h"
#ifdef USE_VMULL_CRC32
# include <sys/auxv.h>
# include <asm/hwcap.h>
#endif
#ifdef CRCTAB
# define BIT(x) ((uint_fast32_t) 1 << (x))
@ -201,6 +206,25 @@ avx512_supported (void)
return avx512_enabled;
}
static bool
vmull_supported (void)
{
/* vmull for multiplication */
bool vmull_enabled = false;
# if USE_VMULL_CRC32
vmull_enabled = (getauxval (AT_HWCAP) & HWCAP_PMULL) > 0;
if (cksum_debug)
error (0, 0, "%s",
(vmull_enabled
? _("using vmull hardware support")
: _("vmull support not detected")));
# endif
return vmull_enabled;
}
static bool
cksum_slice8 (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out)
{
@ -273,6 +297,8 @@ crc_sum_stream (FILE *stream, void *resstream, uintmax_t *length)
cksum_fp = cksum_avx2;
else if (pclmul_supported ())
cksum_fp = cksum_pclmul;
else if (vmull_supported ())
cksum_fp = cksum_vmull;
else
cksum_fp = cksum_slice8;
}

View File

@ -14,6 +14,9 @@ output_crc (char const *file, int binary_file, void const *digest, bool raw,
bool tagged, unsigned char delim, bool args, uintmax_t length)
_GL_ATTRIBUTE_NONNULL ((3));
extern bool
cksum_vmull (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out);
extern bool
cksum_pclmul (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out);

235
src/cksum_vmull.c Normal file
View File

@ -0,0 +1,235 @@
/* cksum -- calculate and print POSIX checksums and sizes of files
Copyright (C) 1992-2024 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>. */
#include <config.h>
#include <stdio.h>
#include <sys/types.h>
#include <stdint.h>
#include <arm_neon.h>
#include "system.h"
/* Number of bytes to read at once. */
#define BUFLEN (1 << 16)
extern uint_fast32_t const crctab[8][256];
extern bool
cksum_vmull (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out);
static uint64x2_t
bswap_neon (uint64x2_t in)
{
uint64x2_t a =
vreinterpretq_u64_u8 (vrev64q_u8 (vreinterpretq_u8_u64 (in)));
a = vcombine_u64 (vget_high_u64 (a), vget_low_u64 (a));
return a;
}
/* Calculate CRC32 using VMULL CPU instruction found in ARMv8 CPUs */
bool
cksum_vmull (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out)
{
uint64x2_t buf[BUFLEN / sizeof (uint64x2_t)];
uint_fast32_t crc = 0;
uintmax_t length = 0;
size_t bytes_read;
poly64x2_t single_mult_constant;
poly64x2_t four_mult_constant;
if (!fp || !crc_out || !length_out)
return false;
/* These constants and general algorithms are taken from the Intel whitepaper
"Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
*/
single_mult_constant =
vcombine_p64 (vcreate_p64 (0xE8A45605), vcreate_p64 (0xC5B9CD4C));
four_mult_constant =
vcombine_p64 (vcreate_p64 (0xE6228B11), vcreate_p64 (0x8833794C));
while ((bytes_read = fread (buf, 1, BUFLEN, fp)) > 0)
{
uint64x2_t *datap;
uint64x2_t data;
uint64x2_t data2;
uint64x2_t data3;
uint64x2_t data4;
uint64x2_t data5;
uint64x2_t data6;
uint64x2_t data7;
uint64x2_t data8;
uint64x2_t fold_data;
uint64x2_t xor_crc;
if (length + bytes_read < length)
{
errno = EOVERFLOW;
return false;
}
length += bytes_read;
datap = (uint64x2_t *) buf;
/* Fold in parallel eight 16-byte blocks into four 16-byte blocks */
if (bytes_read >= 16 * 8)
{
data = vld1q_u64 ((uint64_t *) (datap));
data = bswap_neon (data);
/* XOR in initial CRC value (for us 0 so no effect), or CRC value
calculated for previous BUFLEN buffer from fread */
xor_crc = vcombine_u64 (vcreate_u64 (0), vcreate_u64 (crc << 32));
crc = 0;
data = veorq_u64 (data, xor_crc);
data3 = vld1q_u64 ((uint64_t *) (datap + 1));
data3 = bswap_neon (data3);
data5 = vld1q_u64 ((uint64_t *) (datap + 2));
data5 = bswap_neon (data5);
data7 = vld1q_u64 ((uint64_t *) (datap + 3));
data7 = bswap_neon (data7);
while (bytes_read >= 16 * 8)
{
datap += 4;
/* Do multiplication here for four consecutive 16 byte blocks */
data2 =
vreinterpretq_u64_p128 (vmull_p64
(vgetq_lane_p64
(vreinterpretq_p64_u64 (data), 0),
vgetq_lane_p64 (four_mult_constant,
0)));
data =
vreinterpretq_u64_p128 (vmull_high_p64
(vreinterpretq_p64_u64 (data),
four_mult_constant));
data4 =
vreinterpretq_u64_p128 (vmull_p64
(vgetq_lane_p64
(vreinterpretq_p64_u64 (data3), 0),
vgetq_lane_p64 (four_mult_constant,
0)));
data3 =
vreinterpretq_u64_p128 (vmull_high_p64
(vreinterpretq_p64_u64 (data3),
four_mult_constant));
data6 =
vreinterpretq_u64_p128 (vmull_p64
(vgetq_lane_p64
(vreinterpretq_p64_u64 (data5), 0),
vgetq_lane_p64 (four_mult_constant,
0)));
data5 =
vreinterpretq_u64_p128 (vmull_high_p64
(vreinterpretq_p64_u64 (data5),
four_mult_constant));
data8 =
vreinterpretq_u64_p128 (vmull_p64
(vgetq_lane_p64
(vreinterpretq_p64_u64 (data7), 0),
vgetq_lane_p64 (four_mult_constant,
0)));
data7 =
vreinterpretq_u64_p128 (vmull_high_p64
(vreinterpretq_p64_u64 (data7),
four_mult_constant));
/* Now multiplication results for the four blocks is xor:ed with
next four 16 byte blocks from the buffer. This effectively
"consumes" the first four blocks from the buffer.
Keep xor result in variables for multiplication in next
round of loop. */
data = veorq_u64 (data, data2);
data2 = vld1q_u64 ((uint64_t *) (datap));
data2 = bswap_neon (data2);
data = veorq_u64 (data, data2);
data3 = veorq_u64 (data3, data4);
data4 = vld1q_u64 ((uint64_t *) (datap + 1));
data4 = bswap_neon (data4);
data3 = veorq_u64 (data3, data4);
data5 = veorq_u64 (data5, data6);
data6 = vld1q_u64 ((uint64_t *) (datap + 2));
data6 = bswap_neon (data6);
data5 = veorq_u64 (data5, data6);
data7 = veorq_u64 (data7, data8);
data8 = vld1q_u64 ((uint64_t *) (datap + 3));
data8 = bswap_neon (data8);
data7 = veorq_u64 (data7, data8);
bytes_read -= (16 * 4);
}
/* At end of loop we write out results from variables back into
the buffer, for use in single fold loop */
data = bswap_neon (data);
vst1q_u64 ((uint64_t *) (datap), data);
data3 = bswap_neon (data3);
vst1q_u64 ((uint64_t *) (datap + 1), data3);
data5 = bswap_neon (data5);
vst1q_u64 ((uint64_t *) (datap + 2), data5);
data7 = bswap_neon (data7);
vst1q_u64 ((uint64_t *) (datap + 3), data7);
}
/* Fold two 16-byte blocks into one 16-byte block */
if (bytes_read >= 32)
{
data = vld1q_u64 ((uint64_t *) (datap));
data = bswap_neon (data);
xor_crc = vcombine_u64 (vcreate_u64 (0), vcreate_u64 (crc << 32));
crc = 0;
data = veorq_u64 (data, xor_crc);
while (bytes_read >= 32)
{
datap++;
data2 =
vreinterpretq_u64_p128 (vmull_p64
(vgetq_lane_p64
(vreinterpretq_p64_u64 (data), 0),
vgetq_lane_p64 (single_mult_constant,
0)));
data =
vreinterpretq_u64_p128 (vmull_high_p64
(vreinterpretq_p64_u64 (data),
single_mult_constant));
fold_data = vld1q_u64 ((uint64_t *) (datap));
fold_data = bswap_neon (fold_data);
data = veorq_u64 (data, data2);
data = veorq_u64 (data, fold_data);
bytes_read -= 16;
}
data = bswap_neon (data);
vst1q_u64 ((uint64_t *) (datap), data);
}
/* And finish up last 0-31 bytes in a byte by byte fashion */
unsigned char *cp = (unsigned char *) datap;
while (bytes_read--)
crc = (crc << 8) ^ crctab[0][((crc >> 24) ^ *cp++) & 0xFF];
if (feof (fp))
break;
}
*crc_out = crc;
*length_out = length;
return !ferror (fp);
}

View File

@ -461,6 +461,13 @@ cksum_pclmul_ldadd = src/libcksum_pclmul.a
src_cksum_LDADD += $(cksum_pclmul_ldadd)
src_libcksum_pclmul_a_CFLAGS = -mavx -mpclmul $(AM_CFLAGS)
endif
if USE_VMULL_CRC32
noinst_LIBRARIES += src/libcksum_vmull.a
src_libcksum_vmull_a_SOURCES = src/cksum_vmull.c src/cksum.h
cksum_vmull_ldadd = src/libcksum_vmull.a
src_cksum_LDADD += $(cksum_vmull_ldadd)
src_libcksum_vmull_a_CFLAGS = -march=armv8-a+crypto $(AM_CFLAGS)
endif
src_base64_SOURCES = src/basenc.c
src_base64_CPPFLAGS = -DBASE_TYPE=64 $(AM_CPPFLAGS)