/******************************************** split.c copyright 2008-2020,2024 Thomas E. Dickey copyright 1991-1996,2014 Michael D. Brennan This is a source file for mawk, an implementation of the AWK programming language. Mawk is distributed without warranty under the terms of the GNU General Public License, version 2, 1991. ********************************************/ /* * $MawkId: split.c,v 1.37 2024/12/14 12:53:38 tom Exp $ */ #define Visible_BI_REC #define Visible_CELL #define Visible_RE_DATA #define Visible_STRING #include #include #include #include #include #include #include #include #include #ifndef SP_SIZE #ifdef NO_LEAKS #define SP_SIZE 4 /* exercises split_block_list code */ #else #define SP_SIZE 2048 #endif #endif typedef struct split_block { STRING *strings[SP_SIZE]; struct split_block *link; } Split_Block_Node; static Split_Block_Node split_block_base; static Split_Block_Node *split_block_list = &split_block_base; /* * Usually the list is of size 1. * The list never gets smaller than size 1. * This function returns a bigger list to size 1. */ static void spb_list_shrink(void) { Split_Block_Node *p = split_block_list->link; split_block_list->link = NULL; while (p) { Split_Block_Node *hold = p; p = p->link; zfree(hold, sizeof(Split_Block_Node)); } } /* * This function is passed a pointer to the tail of the list, * adds a new node and returns the new tail * This makes the list one node bigger */ static Split_Block_Node * grow_sp_list(Split_Block_Node * tail) { tail->link = (Split_Block_Node *) zmalloc(sizeof(Split_Block_Node)); tail = tail->link; tail->link = NULL; return tail; } /* * Split string s of length slen on SPACE without changing s. 
* Load the pieces into STRINGS
 * return the number of pieces
 */
/*
 * Parameters:
 *   s    - first byte of the text to split
 *   slen - number of bytes of s to examine
 *
 * Every maximal run of bytes whose scan_code[] entry is not SC_SPACE
 * becomes one STRING.  The STRINGs are stored in order in the
 * split_block_list buffers; a node is appended each time one fills up.
 *
 * Both scanning loops are bounded by s + slen, so no byte at or beyond
 * the end of the buffer is examined.  The buffer therefore does not need
 * to be followed by a non-space byte.
 */
size_t
space_split(const char *s, size_t slen)
{
    size_t cnt = 0;
    const char *end = s + slen;
    Split_Block_Node *node_p = split_block_list;
    unsigned idx = 0;

    for (;;) {
        const char *q;          /* front of the current field */

        /* eat space, never looking at or past end */
        while (s < end && scan_code[*(const unsigned char *) s] == SC_SPACE)
            s++;
        if (s >= end)
            return cnt;

        /* find one field: s is on a non-space byte here */
        q = s++;
        while (s < end && scan_code[*(const unsigned char *) s] != SC_SPACE)
            s++;

        /* create and store the string field */
        node_p->strings[idx] = new_STRING1(q, (size_t) (s - q));
        cnt++;
        if (++idx == SP_SIZE) {
            idx = 0;
            node_p = grow_sp_list(node_p);
        }
    }
}

/*
 * Split string s of length slen on matches of a regular expression.
 *
 * re is an (RE_DATA *) with compiled PTR holding the compiled RE
 *
 * Only matches found by re_pos_match() act as separators.  The text in
 * front of each match is stored as one piece (possibly of length zero).
 * When no further match is found, the rest of s is the last piece.  When
 * the final match ends exactly at the end of s, an empty last piece is
 * stored.
 *
 * Returns the number of pieces stored in the split_block_list buffers;
 * 0 when slen is 0.
 */
size_t
re_split(char *s, size_t slen, PTR re)
{
    size_t cnt = 0;
    const char *end = s + slen;
    Split_Block_Node *node_p = split_block_list;
    unsigned idx = 0;
    int no_bol = 0;

    if (slen == 0)
        return 0;

    while (s < end) {
        size_t mlen;
        char *m = re_pos_match(s, (size_t) (end - s), re, &mlen, no_bol);

        if (m) {
            /* stuff in front of match is a field, might have length zero */
            node_p->strings[idx] = new_STRING1(s, (size_t) (m - s));
            cnt++;
            if (++idx == SP_SIZE) {
                idx = 0;
                node_p = grow_sp_list(node_p);
            }
            /* continue after the separator; no_bol is set from now on */
            s = m + mlen;
            no_bol = 1;
        } else {
            /* no match so last field is what's left */
            node_p->strings[idx] = new_STRING1(s, (size_t) (end - s));
            /* done so don't need to increment idx */
            return ++cnt;
        }
    }
    /* last match at end of s, so last field is "" */
    node_p->strings[idx] = new_STRING0((size_t) 0);
    return ++cnt;
}

/*
 * Matches a string with a regular expression, but only matches of positive
 * length count.
 * input is a string str and its length.
* return is match point else 0 if no match * length of match is returned in *lenp */ char * re_pos_match(char *str, size_t str_len, RE_NODE * re, size_t *lenp, int no_bol) { const char *end = str + str_len; while (str < end) { char *match = REmatch(str, (size_t) (end - str), cast_to_re(re), lenp, no_bol); if (match) { if (*lenp) { /* match of positive length so done */ return match; } else { /* match but zero length, move str forward and try again */ /* note this match must have occurred at front of str */ str = match + 1; no_bol = 1; } } else { /* no match */ *lenp = 0; return NULL; } } *lenp = 0; return NULL; } /* * like space split but splits s into single character strings */ size_t null_split(const char *s, size_t slen) { const char *end = s + slen; Split_Block_Node *node_p = split_block_list; unsigned idx = 0; while (s < end) { node_p->strings[idx] = new_STRING1(s++, (size_t) 1); if (++idx == SP_SIZE) { idx = 0; node_p = grow_sp_list(node_p); } } return slen; } /* * The caller knows there are cnt STRING* in the split_block_list * buffers. This function uses them to make CELLs in cp[] * The target CELLs are virgin, they don't need to be * destroyed */ void transfer_to_array(CELL cp[], size_t cnt) { Split_Block_Node *node_p = split_block_list; unsigned idx = 0; while (cnt > 0) { cp->type = C_MBSTRN; cp->ptr = (PTR) node_p->strings[idx]; cnt--; cp++; if (++idx == SP_SIZE) { idx = 0; node_p = node_p->link; } } if (node_p != split_block_list) spb_list_shrink(); } /* * like above but transfers the saved pieces to $1, $2 ... 
$cnt
 * The target CELLs may be string type so need to be destroyed
 * The caller has made sure the target CELLs exist
 */
/*
 * The walk starts at field[1] and runs to the end of field[]
 * (FBANK_SZ cells); after that it continues through fbankv[1],
 * fbankv[2], ... with FBANK_SZ cells each.  If the walk over the split
 * buffers moved beyond the first node, the list is shrunk afterwards.
 */
void
transfer_to_fields(size_t cnt)
{
    CELL *dst = &field[1];      /* start with $1 */
    CELL *bank_end = &field[FBANK_SZ];
    Split_Block_Node *node_p = split_block_list;
    unsigned idx = 0;
    unsigned bank = 0;

    for (; cnt > 0; cnt--) {
        cell_destroy(dst);
        dst->type = C_MBSTRN;
        dst->ptr = (PTR) node_p->strings[idx];

        /* advance the source position */
        idx++;
        if (idx == SP_SIZE) {
            idx = 0;
            node_p = node_p->link;
        }

        /* advance the target position, moving to the next bank if needed */
        dst++;
        if (dst == bank_end) {
            bank++;
            dst = &fbankv[bank][0];
            bank_end = dst + FBANK_SZ;
        }
    }

    if (node_p != split_block_list)
        spb_list_shrink();
}

/*
 * split(s, X, r)
 * split s into array X on r
 *
 * mawk state is EXECUTION sp points at top of eval_stack[]
 *
 * entry: sp[0] holds r
 *        sp[-1] pts at X
 *        sp[-2] holds s
 *
 * exit : sp is 2 less, sp[0] is C_DOUBLE CELL with value equal
 *        to the number of split pieces
 */
CELL *
bi_split(CELL *sp)
{
    size_t cnt = 0;             /* the number of pieces */
    CELL *sep;                  /* the cell holding r */
    STRING *text;               /* the string value of s */

    if (sp->type < C_RE)
        cast_for_split(sp);
    /* can be C_RE, C_SPACE or C_SNULL */
    sep = sp;

    sp -= 2;
    if (sp->type < C_STRING)
        cast1_to_s(sp);
    text = string(sp);

    /* an empty string has nothing to split, cnt stays 0 */
    if (text->len != 0) {
        switch (sep->type) {
        case C_RE:
            if (isEmpty_RE(sep->ptr))
                cnt = null_split(text->str, text->len);
            else
                cnt = re_split(text->str, text->len, sep->ptr);
            break;

        case C_SPACE:
            cnt = space_split(text->str, text->len);
            break;

        case C_SNULL:           /* split on empty string */
            cnt = null_split(text->str, text->len);
            break;

        default:
            bozo("bad splitting cell in bi_split");
        }
    }

    /* the result cell replaces s: release the string, store the count */
    free_STRING(text);
    sp->type = C_DOUBLE;
    sp->dval = (double) cnt;

    array_load((ARRAY) (sp + 1)->ptr, cnt);

    return sp;
}