mirror of
https://https.git.savannah.gnu.org/git/coreutils.git
synced 2026-01-26 15:29:07 +00:00
cksum: use ARMv8 SIMD extensions
* configure.ac: Add check for ARMv8 VMULL support. * src/cksum.c: Add ARMv8 VMULL detection function. * src/cksum.h: Add ARMv8 VMULL implementation declaration. * src/cksum_vmull.c: ARMv8 VMULL implementation. * src/local.mk: Add build flags for ARMv8 VMULL. * NEWS: Mention the ARMv8 SIMD improvement.
This commit is contained in:
parent
fd01fc8075
commit
d155be4a22
4
NEWS
4
NEWS
@ -70,8 +70,8 @@ GNU coreutils NEWS -*- outline -*-
|
||||
|
||||
** Improvements
|
||||
|
||||
cksum -a crc, makes use of AVX2 and AVX512 extensions for time reductions
|
||||
of 40% and 60% respectively.
|
||||
cksum -a crc, makes use of AVX2, AVX512, and ARMv8 SIMD extensions
|
||||
for time reductions of up to 40%, 60%, and 80% respectively.
|
||||
|
||||
'head -c NUM', 'head -n NUM', 'nl -l NUM', 'nproc --ignore NUM',
|
||||
'tail -c NUM', 'tail -n NUM', and 'tail --max-unchanged-stats NUM’
|
||||
|
||||
34
configure.ac
34
configure.ac
@ -618,6 +618,40 @@ if test $utils_cv_brain_16_bit_supported = yes; then
|
||||
AC_DEFINE([BF16_SUPPORTED], [1], [Brain 16 bit float supported])
|
||||
fi
|
||||
|
||||
ac_save_CFLAGS=$CFLAGS
|
||||
CFLAGS="-march=armv8-a+crypto $CFLAGS"
|
||||
AC_MSG_CHECKING([if vmull intrinsic exists])
|
||||
AC_CACHE_VAL([utils_cv_vmull_intrinsic_exists],[
|
||||
AC_LINK_IFELSE(
|
||||
[AC_LANG_SOURCE([[
|
||||
#include <stdio.h>
|
||||
#include <sys/auxv.h>
|
||||
#include <asm/hwcap.h>
|
||||
#include <arm_neon.h>
|
||||
|
||||
int
|
||||
main (void)
|
||||
{
|
||||
uint64x2_t a;
|
||||
poly64_t shift64 = vget_lane_p64(vcreate_p64(0xB8BC6765), 0);
|
||||
a = vreinterpretq_u64_p128(vmull_p64(shift64, vreinterpretq_p128_u64(a)));
|
||||
return (getauxval(AT_HWCAP) & HWCAP_PMULL) > 0;
|
||||
}
|
||||
]])
|
||||
],[
|
||||
utils_cv_vmull_intrinsic_exists=yes
|
||||
],[
|
||||
utils_cv_vmull_intrinsic_exists=no
|
||||
])])
|
||||
AC_MSG_RESULT([$utils_cv_vmull_intrinsic_exists])
|
||||
if test $utils_cv_vmull_intrinsic_exists = yes; then
|
||||
AC_DEFINE([USE_VMULL_CRC32], [1],
|
||||
[CRC32 calculation by vmull hardware instruction enabled])
|
||||
fi
|
||||
AM_CONDITIONAL([USE_VMULL_CRC32],
|
||||
[test $utils_cv_vmull_intrinsic_exists = yes])
|
||||
CFLAGS=$ac_save_CFLAGS
|
||||
|
||||
ac_save_CFLAGS=$CFLAGS
|
||||
CFLAGS="-mavx -mpclmul $CFLAGS"
|
||||
AC_MSG_CHECKING([if pclmul intrinsic exists])
|
||||
|
||||
26
src/cksum.c
26
src/cksum.c
@ -40,6 +40,11 @@
|
||||
#include <endian.h>
|
||||
#include "system.h"
|
||||
|
||||
#ifdef USE_VMULL_CRC32
|
||||
# include <sys/auxv.h>
|
||||
# include <asm/hwcap.h>
|
||||
#endif
|
||||
|
||||
#ifdef CRCTAB
|
||||
|
||||
# define BIT(x) ((uint_fast32_t) 1 << (x))
|
||||
@ -201,6 +206,25 @@ avx512_supported (void)
|
||||
return avx512_enabled;
|
||||
}
|
||||
|
||||
static bool
|
||||
vmull_supported (void)
|
||||
{
|
||||
/* vmull for multiplication */
|
||||
bool vmull_enabled = false;
|
||||
# if USE_VMULL_CRC32
|
||||
|
||||
vmull_enabled = (getauxval (AT_HWCAP) & HWCAP_PMULL) > 0;
|
||||
|
||||
if (cksum_debug)
|
||||
error (0, 0, "%s",
|
||||
(vmull_enabled
|
||||
? _("using vmull hardware support")
|
||||
: _("vmull support not detected")));
|
||||
# endif
|
||||
|
||||
return vmull_enabled;
|
||||
}
|
||||
|
||||
static bool
|
||||
cksum_slice8 (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out)
|
||||
{
|
||||
@ -273,6 +297,8 @@ crc_sum_stream (FILE *stream, void *resstream, uintmax_t *length)
|
||||
cksum_fp = cksum_avx2;
|
||||
else if (pclmul_supported ())
|
||||
cksum_fp = cksum_pclmul;
|
||||
else if (vmull_supported ())
|
||||
cksum_fp = cksum_vmull;
|
||||
else
|
||||
cksum_fp = cksum_slice8;
|
||||
}
|
||||
|
||||
@ -14,6 +14,9 @@ output_crc (char const *file, int binary_file, void const *digest, bool raw,
|
||||
bool tagged, unsigned char delim, bool args, uintmax_t length)
|
||||
_GL_ATTRIBUTE_NONNULL ((3));
|
||||
|
||||
extern bool
|
||||
cksum_vmull (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out);
|
||||
|
||||
extern bool
|
||||
cksum_pclmul (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out);
|
||||
|
||||
|
||||
235
src/cksum_vmull.c
Normal file
235
src/cksum_vmull.c
Normal file
@ -0,0 +1,235 @@
|
||||
/* cksum -- calculate and print POSIX checksums and sizes of files
|
||||
Copyright (C) 1992-2024 Free Software Foundation, Inc.
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include <config.h>
|
||||
|
||||
#include <stdio.h>
|
||||
#include <sys/types.h>
|
||||
#include <stdint.h>
|
||||
#include <arm_neon.h>
|
||||
#include "system.h"
|
||||
|
||||
/* Number of bytes to read at once. */
|
||||
#define BUFLEN (1 << 16)
|
||||
|
||||
extern uint_fast32_t const crctab[8][256];
|
||||
|
||||
extern bool
|
||||
cksum_vmull (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out);
|
||||
|
||||
static uint64x2_t
|
||||
bswap_neon (uint64x2_t in)
|
||||
{
|
||||
uint64x2_t a =
|
||||
vreinterpretq_u64_u8 (vrev64q_u8 (vreinterpretq_u8_u64 (in)));
|
||||
a = vcombine_u64 (vget_high_u64 (a), vget_low_u64 (a));
|
||||
return a;
|
||||
}
|
||||
|
||||
/* Calculate CRC32 using VMULL CPU instruction found in ARMv8 CPUs */
|
||||
|
||||
bool
|
||||
cksum_vmull (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out)
|
||||
{
|
||||
uint64x2_t buf[BUFLEN / sizeof (uint64x2_t)];
|
||||
uint_fast32_t crc = 0;
|
||||
uintmax_t length = 0;
|
||||
size_t bytes_read;
|
||||
poly64x2_t single_mult_constant;
|
||||
poly64x2_t four_mult_constant;
|
||||
|
||||
if (!fp || !crc_out || !length_out)
|
||||
return false;
|
||||
|
||||
/* These constants and general algorithms are taken from the Intel whitepaper
|
||||
"Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
|
||||
*/
|
||||
single_mult_constant =
|
||||
vcombine_p64 (vcreate_p64 (0xE8A45605), vcreate_p64 (0xC5B9CD4C));
|
||||
four_mult_constant =
|
||||
vcombine_p64 (vcreate_p64 (0xE6228B11), vcreate_p64 (0x8833794C));
|
||||
|
||||
while ((bytes_read = fread (buf, 1, BUFLEN, fp)) > 0)
|
||||
{
|
||||
uint64x2_t *datap;
|
||||
uint64x2_t data;
|
||||
uint64x2_t data2;
|
||||
uint64x2_t data3;
|
||||
uint64x2_t data4;
|
||||
uint64x2_t data5;
|
||||
uint64x2_t data6;
|
||||
uint64x2_t data7;
|
||||
uint64x2_t data8;
|
||||
uint64x2_t fold_data;
|
||||
uint64x2_t xor_crc;
|
||||
|
||||
if (length + bytes_read < length)
|
||||
{
|
||||
errno = EOVERFLOW;
|
||||
return false;
|
||||
}
|
||||
length += bytes_read;
|
||||
|
||||
datap = (uint64x2_t *) buf;
|
||||
|
||||
/* Fold in parallel eight 16-byte blocks into four 16-byte blocks */
|
||||
if (bytes_read >= 16 * 8)
|
||||
{
|
||||
data = vld1q_u64 ((uint64_t *) (datap));
|
||||
data = bswap_neon (data);
|
||||
/* XOR in initial CRC value (for us 0 so no effect), or CRC value
|
||||
calculated for previous BUFLEN buffer from fread */
|
||||
xor_crc = vcombine_u64 (vcreate_u64 (0), vcreate_u64 (crc << 32));
|
||||
crc = 0;
|
||||
data = veorq_u64 (data, xor_crc);
|
||||
data3 = vld1q_u64 ((uint64_t *) (datap + 1));
|
||||
data3 = bswap_neon (data3);
|
||||
data5 = vld1q_u64 ((uint64_t *) (datap + 2));
|
||||
data5 = bswap_neon (data5);
|
||||
data7 = vld1q_u64 ((uint64_t *) (datap + 3));
|
||||
data7 = bswap_neon (data7);
|
||||
|
||||
|
||||
while (bytes_read >= 16 * 8)
|
||||
{
|
||||
datap += 4;
|
||||
|
||||
/* Do multiplication here for four consecutive 16 byte blocks */
|
||||
data2 =
|
||||
vreinterpretq_u64_p128 (vmull_p64
|
||||
(vgetq_lane_p64
|
||||
(vreinterpretq_p64_u64 (data), 0),
|
||||
vgetq_lane_p64 (four_mult_constant,
|
||||
0)));
|
||||
data =
|
||||
vreinterpretq_u64_p128 (vmull_high_p64
|
||||
(vreinterpretq_p64_u64 (data),
|
||||
four_mult_constant));
|
||||
data4 =
|
||||
vreinterpretq_u64_p128 (vmull_p64
|
||||
(vgetq_lane_p64
|
||||
(vreinterpretq_p64_u64 (data3), 0),
|
||||
vgetq_lane_p64 (four_mult_constant,
|
||||
0)));
|
||||
data3 =
|
||||
vreinterpretq_u64_p128 (vmull_high_p64
|
||||
(vreinterpretq_p64_u64 (data3),
|
||||
four_mult_constant));
|
||||
data6 =
|
||||
vreinterpretq_u64_p128 (vmull_p64
|
||||
(vgetq_lane_p64
|
||||
(vreinterpretq_p64_u64 (data5), 0),
|
||||
vgetq_lane_p64 (four_mult_constant,
|
||||
0)));
|
||||
data5 =
|
||||
vreinterpretq_u64_p128 (vmull_high_p64
|
||||
(vreinterpretq_p64_u64 (data5),
|
||||
four_mult_constant));
|
||||
data8 =
|
||||
vreinterpretq_u64_p128 (vmull_p64
|
||||
(vgetq_lane_p64
|
||||
(vreinterpretq_p64_u64 (data7), 0),
|
||||
vgetq_lane_p64 (four_mult_constant,
|
||||
0)));
|
||||
data7 =
|
||||
vreinterpretq_u64_p128 (vmull_high_p64
|
||||
(vreinterpretq_p64_u64 (data7),
|
||||
four_mult_constant));
|
||||
|
||||
/* Now multiplication results for the four blocks is xor:ed with
|
||||
next four 16 byte blocks from the buffer. This effectively
|
||||
"consumes" the first four blocks from the buffer.
|
||||
Keep xor result in variables for multiplication in next
|
||||
round of loop. */
|
||||
data = veorq_u64 (data, data2);
|
||||
data2 = vld1q_u64 ((uint64_t *) (datap));
|
||||
data2 = bswap_neon (data2);
|
||||
data = veorq_u64 (data, data2);
|
||||
|
||||
data3 = veorq_u64 (data3, data4);
|
||||
data4 = vld1q_u64 ((uint64_t *) (datap + 1));
|
||||
data4 = bswap_neon (data4);
|
||||
data3 = veorq_u64 (data3, data4);
|
||||
|
||||
data5 = veorq_u64 (data5, data6);
|
||||
data6 = vld1q_u64 ((uint64_t *) (datap + 2));
|
||||
data6 = bswap_neon (data6);
|
||||
data5 = veorq_u64 (data5, data6);
|
||||
|
||||
data7 = veorq_u64 (data7, data8);
|
||||
data8 = vld1q_u64 ((uint64_t *) (datap + 3));
|
||||
data8 = bswap_neon (data8);
|
||||
data7 = veorq_u64 (data7, data8);
|
||||
|
||||
bytes_read -= (16 * 4);
|
||||
}
|
||||
/* At end of loop we write out results from variables back into
|
||||
the buffer, for use in single fold loop */
|
||||
data = bswap_neon (data);
|
||||
vst1q_u64 ((uint64_t *) (datap), data);
|
||||
data3 = bswap_neon (data3);
|
||||
vst1q_u64 ((uint64_t *) (datap + 1), data3);
|
||||
data5 = bswap_neon (data5);
|
||||
vst1q_u64 ((uint64_t *) (datap + 2), data5);
|
||||
data7 = bswap_neon (data7);
|
||||
vst1q_u64 ((uint64_t *) (datap + 3), data7);
|
||||
}
|
||||
|
||||
/* Fold two 16-byte blocks into one 16-byte block */
|
||||
if (bytes_read >= 32)
|
||||
{
|
||||
data = vld1q_u64 ((uint64_t *) (datap));
|
||||
data = bswap_neon (data);
|
||||
xor_crc = vcombine_u64 (vcreate_u64 (0), vcreate_u64 (crc << 32));
|
||||
crc = 0;
|
||||
data = veorq_u64 (data, xor_crc);
|
||||
while (bytes_read >= 32)
|
||||
{
|
||||
datap++;
|
||||
|
||||
data2 =
|
||||
vreinterpretq_u64_p128 (vmull_p64
|
||||
(vgetq_lane_p64
|
||||
(vreinterpretq_p64_u64 (data), 0),
|
||||
vgetq_lane_p64 (single_mult_constant,
|
||||
0)));
|
||||
data =
|
||||
vreinterpretq_u64_p128 (vmull_high_p64
|
||||
(vreinterpretq_p64_u64 (data),
|
||||
single_mult_constant));
|
||||
fold_data = vld1q_u64 ((uint64_t *) (datap));
|
||||
fold_data = bswap_neon (fold_data);
|
||||
data = veorq_u64 (data, data2);
|
||||
data = veorq_u64 (data, fold_data);
|
||||
bytes_read -= 16;
|
||||
}
|
||||
data = bswap_neon (data);
|
||||
vst1q_u64 ((uint64_t *) (datap), data);
|
||||
}
|
||||
|
||||
/* And finish up last 0-31 bytes in a byte by byte fashion */
|
||||
unsigned char *cp = (unsigned char *) datap;
|
||||
while (bytes_read--)
|
||||
crc = (crc << 8) ^ crctab[0][((crc >> 24) ^ *cp++) & 0xFF];
|
||||
if (feof (fp))
|
||||
break;
|
||||
}
|
||||
|
||||
*crc_out = crc;
|
||||
*length_out = length;
|
||||
|
||||
return !ferror (fp);
|
||||
}
|
||||
@ -461,6 +461,13 @@ cksum_pclmul_ldadd = src/libcksum_pclmul.a
|
||||
src_cksum_LDADD += $(cksum_pclmul_ldadd)
|
||||
src_libcksum_pclmul_a_CFLAGS = -mavx -mpclmul $(AM_CFLAGS)
|
||||
endif
|
||||
if USE_VMULL_CRC32
|
||||
noinst_LIBRARIES += src/libcksum_vmull.a
|
||||
src_libcksum_vmull_a_SOURCES = src/cksum_vmull.c src/cksum.h
|
||||
cksum_vmull_ldadd = src/libcksum_vmull.a
|
||||
src_cksum_LDADD += $(cksum_vmull_ldadd)
|
||||
src_libcksum_vmull_a_CFLAGS = -march=armv8-a+crypto $(AM_CFLAGS)
|
||||
endif
|
||||
|
||||
src_base64_SOURCES = src/basenc.c
|
||||
src_base64_CPPFLAGS = -DBASE_TYPE=64 $(AM_CPPFLAGS)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user