preproc: implement %hs2b() and %b2hs() functions for compact binary data

Convenience preprocessor functions that allow for efficient packing of binary data in source code. Move some functions that have previously been local but are more generally useful into more accessible places. Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
2026-01-26 16:09:24 +00:00 · 2025-09-15 23:01:59 -07:00 · 2025-09-15 23:01:59 -07:00 · f6166e571a
commit f6166e571a
parent 856ac7b7fb
8 changed files with 183 additions and 36 deletions
--- a/asm/floats.c
+++ b/asm/floats.c
@ -495,17 +495,6 @@ static bool ieee_round(bool minus, fp_limb *mant, int bits)
    return false;
 }

-/* Returns a value >= 16 if not a valid hex digit */
-static unsigned int hexval(char c)
-{
-    unsigned int v = (unsigned char) c;
-
-    if (v >= '0' && v <= '9')
-        return v - '0';
-    else
-        return (v|0x20) - 'a' + 10;
-}
-
 /* Handle floating-point numbers with radix 2^bits and binary exponent */
 static bool ieee_flconvert_bin(const char *string, int bits,
                               fp_limb *mant, int32_t *exponent)
@ -535,7 +524,7 @@ static bool ieee_flconvert_bin(const char *string, int bits,
                nasm_nonfatal("too many periods in floating-point constant");
                return false;
            }
-        } else if ((v = hexval(c)) < (unsigned int)radix) {
+        } else if ((v = nasm_hexval(c)) < (unsigned int)radix) {
            if (!seendigit && v) {
                int l = log2tbl[v];

--- a/asm/preproc.c
+++ b/asm/preproc.c
@ -7688,6 +7688,7 @@ stdmac_strcat(const SMacro *s, Token **params, int nparams)
    int i;
    size_t len = 0;
    char *str, *p;
+    Token *t;

    (void)s;

@ -7696,14 +7697,112 @@ stdmac_strcat(const SMacro *s, Token **params, int nparams)
        len += params[i]->len;
    }

-    nasm_newn(str, len+1);
-    p = str;
+    p = str = nasm_malloc(len+1);

    for (i = 0; i < nparams; i++) {
        p = mempcpy(p, tok_text(params[i]), params[i]->len);
    }
+    *p = '\0';

-    return make_tok_qstr_len(NULL, str, len);
+    t = make_tok_qstr_len(NULL, str, p - str);
+    nasm_free(str);
+    return t;
+}
+
+/*
+ * %hs2b() function
+ *
+ * Decodes the hexadecimal digits found in each string parameter into
+ * raw bytes and returns them as a single token built by
+ * make_tok_qstr_len(). Two consecutive digits form one byte; any
+ * character for which nasm_hexval() reports a non-digit acts as a
+ * separator, so a lone digit in front of a separator, or at the end
+ * of a parameter, is emitted as a byte of its own. Digits never pair
+ * up across parameter boundaries.
+ */
+static Token *
+stdmac_hs2b(const SMacro *s, Token **params, int nparams)
+{
+    int i;
+    size_t maxlen = 0;
+    char *str, *q;
+    Token *t;
+
+    (void)s;
+
+    for (i = 0; i < nparams; i++) {
+        unquote_token(params[i]);
+        /* Worst case is one output byte per "digit, separator" pair */
+        maxlen += (params[i]->len + 1) >> 1;
+    }
+
+    q = str = nasm_malloc(maxlen+1);
+
+    for (i = 0; i < nparams; i++) {
+        const char *p = tok_text(params[i]);
+        unsigned int j;
+        unsigned int plen = params[i]->len;
+        int v = -1;             /* Pending first digit, -1 if none */
+
+        for (j = 0; j < plen; j++) {
+            unsigned int hv = nasm_hexval(*p++);
+            if (hv > 15) {
+                /* Separator character: flush a pending lone digit */
+                if (v >= 0)
+                    *q++ = v;
+                v = -1;
+            } else if (v >= 0) {
+                /* Second digit completes the byte */
+                *q++ = (v << 4) + hv;
+                v = -1;
+            } else {
+                v = hv;
+            }
+        }
+        /* Partial byte at the end of this parameter? */
+        if (v >= 0)
+            *q++ = v;
+    }
+    *q = '\0';
+
+    t = make_tok_qstr_len(NULL, str, q - str);
+    nasm_free(str);             /* Scratch buffer only; freed once the token exists */
+    return t;
+}
+
+/*
+ * %b2hs() function
+ *
+ * params[0] is the byte string and params[1] the separator string.
+ * Every input byte becomes two digits taken from
+ * nasm_digit_chars(false), high nibble first, and the separator is
+ * copied between (never after) consecutive bytes. An empty input
+ * yields an empty string token.
+ */
+static Token *
+stdmac_b2hs(const SMacro *s, Token **params, int nparams)
+{
+    const char * const digits = nasm_digit_chars(false);
+    const char *src, *sep;
+    char *buf, *out;
+    size_t nbytes, seplen, outlen, n;
+    Token *result;
+
+    (void)s;
+    (void)nparams;
+
+    src = unquote_token(params[0]);
+
+    if (!params[0]->len)
+        return make_tok_qstr_len(NULL, "", 0);
+
+    sep    = unquote_token(params[1]);
+    nbytes = params[0]->len;
+    seplen = params[1]->len;
+
+    /* Two digits per byte plus one separator between adjacent bytes */
+    outlen = 2 * nbytes + seplen * (nbytes - 1);
+    out = buf = nasm_malloc(outlen + 1);
+
+    for (n = 0; n < nbytes; n++) {
+        const uint8_t b = src[n];
+
+        if (n > 0 && seplen)
+            out = mempcpy(out, sep, seplen);
+
+        *out++ = digits[b >> 4];
+        *out++ = digits[b & 15];
+    }
+    *out = '\0';
+
+    result = make_tok_qstr_len(NULL, buf, out - buf);
+    nasm_free(buf);
+    return result;
 }

 /* %substr() function */
@ -8223,12 +8322,11 @@ static void pp_add_magic_simple(void)
        { "__?PTR?__",   true, 0, 0, stdmac_ptr },
        { "__?DEFAULT?__", true, 0, 0, stdmac_default },
        { "%abs",        false, 1, SPARM_EVAL, stdmac_abs },
-//        { "%b2hs",       false, 1, SPARM_STR|SPARM_CONDQUOTE, stdmac_b2hs },
        { "%chr",        false, 1, SPARM_EVAL|SPARM_OPTIONAL|SPARM_VARADIC, stdmac_chr },
        { "%count",      false, 1, SPARM_VARADIC, stdmac_count },
        { "%depend",     false, 1, SPARM_PLAIN, stdmac_depend },
        { "%eval",       false, 1, SPARM_EVAL|SPARM_VARADIC, stdmac_join },
-//        { "%hs2b",       false, 1, SPARM_STR|SPARM_CONDQUOTE, stdmac_hs2b },
+        { "%hs2b",       false, 1, SPARM_STR|SPARM_CONDQUOTE|SPARM_VARADIC, stdmac_hs2b },
        { "%map",	 false, 1, SPARM_VARADIC, stdmac_map },
        { "%null",       false, 1, SPARM_GREEDY, stdmac_null },
        { "%pathsearch", false, 1, SPARM_PLAIN, stdmac_pathsearch },
@ -8381,6 +8479,17 @@ static void pp_add_magic_miscfunc(void)
    tmpl.params[2].def    = make_tok_num(NULL, 1);
    define_magic("%ord", false, &tmpl);

+    /* %b2hs() function */
+    nasm_zero(tmpl);
+    tmpl.nparam = 2;
+    tmpl.expand = stdmac_b2hs;
+    tmpl.recursive = true;
+    nasm_newn(tmpl.params, tmpl.nparam);
+    tmpl.params[0].flags  = SPARM_STR|SPARM_CONDQUOTE;
+    tmpl.params[1].flags  = SPARM_STR|SPARM_CONDQUOTE|SPARM_OPTIONAL;
+    tmpl.params[1].def    = make_tok_qstr_len(NULL, "", 0);
+    define_magic("%b2hs", false, &tmpl);
+
    /* %find[i]() functions */
    for (i = 0; i < 2; i++) {
        static const char * const names[] = { "%findi", "%find" };
--- a/doc/changes.src
+++ b/doc/changes.src
@ -17,9 +17,9 @@ It is the production version of NASM since 2025.
 \b Add support for the APX and AVX10 instruction sets, and various
   miscellaneous new instructions.

-\b Add new preprocessor functions: \c{%chr()}, \c{%depend()},
-   \c{%find()}, \c{%findi()}, \c{%null()}, \c{%ord()},
-   \c{%pathsearch()}, and \c{%realpath()}. See \k{ppfunc}.
+\b Add new preprocessor functions: \c{%b2hs()}, \c{%chr()},
+   \c{%depend()}, \c{%find()}, \c{%findi()}, \c{%hs2b()}, \c{%null()},
+   \c{%ord()}, \c{%pathsearch()}, and \c{%realpath()}. See \k{ppfunc}.

 \b New preprocessor directive \c{%note} to insert a note in the list
   file, without issuing an external diagnostic.  Unlike a comment, it
--- a/doc/preproc.src
+++ b/doc/preproc.src
@ -714,6 +714,15 @@ single token containing a decimal number; no minus sign will be
 emitted even if the input value is the maximum negative number.


+\S{f_b2hs} \i\c{%b2hs()} Function
+
+The \c{%b2hs()} function takes a quoted string and an optional
+separator string, and expands to a quoted string containing a packed
+hexadecimal form of the bytes of the first string, separated by the
+separator string if applicable. This is the inverse of the \c{%hs2b()}
+function, see \k{f_hs2b}.
+
+
 \S{f_chr} \i\c{%chr()} Function

 The \c{%chr()} function evaluates its arguments as integers, then
@ -759,6 +768,7 @@ This is the function equivalent of the \c{%depend} directive, see

 See also the \c{%pathsearch()} function (\k{f_pathsearch}).

+
 \S{f_eval} \i\c{%eval()} Function

 The \c{%eval()} function evaluates its argument as a numeric
@ -805,6 +815,23 @@ Equivalent to \i\c\{%eval()}, except that the results generated are
 given as unsigned hexadecimal, with a \c{0x} prefix.


+\S{f_hs2b} \i\c{%hs2b()} Function
+
+The \c{%hs2b()} function takes one or more quoted strings containing
+hexadecimal numbers and optional separators (any character that is not
+a valid hexadecimal digit is considered a separator) and expands to a
+quoted string containing the bytes encoded in the hexadecimal
+string. Every pair of hexadecimal digits encodes a byte, but a
+separator will always terminate the encoding of a byte. Thus, these
+two statements will produce the same output:
+
+\c      db 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09
+\c      db %hs2b("00010203 4 0506 07 8","9")
+
+This can be used to compactly encode long strings of binary data in
+source code.
+
+
 \S{f_is} \i\c{%is()}\I\c{%isn()} Family Functions

 Each \c{%if} conditional assembly family directive (see \k{condasm})
--- a/include/nasmlib.h
+++ b/include/nasmlib.h
@ -292,6 +292,12 @@ int64_t readstrnum(char *str, int length, bool *warn);
 int numstr(char *buf, size_t buflen, uint64_t n,
           int digits, unsigned int base, bool ucase);

+/*
+ * Digit character tables: element 0 is the lower case set, element 1
+ * the upper case set.
+ */
+extern const char * const nasmlib_digit_chars[2];
+
+/* Select the digit table for the requested letter case */
+static inline const char *nasm_digit_chars(bool ucase)
+{
+    return ucase ? nasmlib_digit_chars[1] : nasmlib_digit_chars[0];
+}
+
 /*
 * seg_alloc: allocate a hitherto unused segment number.
 */
--- a/include/nctype.h
+++ b/include/nctype.h
@ -124,4 +124,15 @@ static inline void nasm_ctype_tasm_mode(void)
    /* No differences at the present moment */
 }

+/*
+ * Convert one ASCII hexadecimal digit (0-9, a-f, A-F) to its value.
+ * Returns a value >= 16 if not a valid hex digit.
+ *
+ * Folding case with |0x20 is only meaningful for letters: '@' (0x40)
+ * and '`' (0x60) both fold to 0x60, one below 'a', and the unsigned
+ * subtraction would wrap around to 9 for them, so anything that
+ * folds below 'a' is rejected explicitly.
+ */
+static inline unsigned int nasm_hexval(char c)
+{
+    unsigned int v = (unsigned char) c;
+
+    if (v >= '0' && v <= '9')
+        return v - '0';
+
+    v |= 0x20;                  /* Fold upper case onto lower case */
+    if (v < 'a')
+        return 16;              /* Not a letter, hence not a hex digit */
+
+    return v - 'a' + 10;        /* Anything above 'f' yields >= 16 */
+}
+
 #endif /* NASM_NCTYPE_H */
--- a/nasmlib/numstr.c
+++ b/nasmlib/numstr.c
@ -1,6 +1,6 @@
 /* ----------------------------------------------------------------------- *
 *
- *   Copyright 2023 The NASM Authors - All Rights Reserved
+ *   Copyright 2023-2025 The NASM Authors - All Rights Reserved
 *   See the file AUTHORS included with the NASM distribution for
 *   the specific copyright holders.
 *
@ -33,6 +33,20 @@

 #include "nasmlib.h"

+const char * const nasmlib_digit_chars[2] = { /* Indexed by the bool "ucase", see nasm_digit_chars() */
+    /* Lower case version: 0-9, then a-z, then A-Z, then "@_" */
+    "0123456789"
+    "abcdefghijklmnopqrstuvwxyz"
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+    "@_",
+
+    /* Upper case version: 0-9, then A-Z, then a-z, then "@_" */
+    "0123456789"
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+    "abcdefghijklmnopqrstuvwxyz"
+    "@_"
+};
+
 /*
 * Produce an unsigned integer string from a number with a specified
 * base, digits and signedness.
@ -40,21 +54,7 @@
 int numstr(char *buf, size_t buflen, uint64_t n,
           int digits, unsigned int base, bool ucase)
 {
-    static const char digit_chars[2][NUMSTR_MAXBASE+1] =
-    {
-        /* Lower case version */
-        "0123456789"
-        "abcdefghijklmnopqrstuvwxyz"
-        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
-        "@_",
-
-        /* Upper case version */
-        "0123456789"
-        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
-        "abcdefghijklmnopqrstuvwxyz"
-        "@_"
-    };
-    const char * const dchars = digit_chars[ucase];
+    const char * const dchars = nasm_digit_chars(ucase);
    bool moredigits = digits <= 0;
    char *p;
    int len;
--- a/test/chrord.asm
+++ b/test/chrord.asm
@ -2,3 +2,8 @@
 	db %ord("Hello, World!")
 	db %ord("Hello, World!",1,-1)
 	db %chr()
+	db %b2hs("Hello, World!")
+	db %b2hs("Hello, World!",':')
+	db %hs2b("303132 33 34 35 3 6 3 78 9", "abcd")
+	db 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09
+	db %hs2b("00010203 4 0506 07 8","9")