mawk/split.c
2024-12-14 22:54:53 +00:00

327 lines
7.2 KiB
C

/********************************************
split.c
copyright 2008-2020,2024 Thomas E. Dickey
copyright 1991-1996,2014 Michael D. Brennan
This is a source file for mawk, an implementation of
the AWK programming language.
Mawk is distributed without warranty under the terms of
the GNU General Public License, version 2, 1991.
********************************************/
/*
* $MawkId: split.c,v 1.37 2024/12/14 12:53:38 tom Exp $
*/
#define Visible_BI_REC
#define Visible_CELL
#define Visible_RE_DATA
#define Visible_STRING
#include <mawk.h>
#include <split.h>
#include <symtype.h>
#include <bi_vars.h>
#include <bi_funct.h>
#include <memory.h>
#include <scan.h>
#include <regexp.h>
#include <field.h>
#ifndef SP_SIZE
#ifdef NO_LEAKS
#define SP_SIZE 4 /* exercises split_block_list code */
#else
#define SP_SIZE 2048
#endif
#endif
typedef struct split_block {
STRING *strings[SP_SIZE];
struct split_block *link;
} Split_Block_Node;
static Split_Block_Node split_block_base;
static Split_Block_Node *split_block_list = &split_block_base;
/*
* Usually the list is of size 1.
* The list never gets smaller than size 1.
* This function returns a bigger list to size 1.
*/
static void
spb_list_shrink(void)
{
Split_Block_Node *p = split_block_list->link;
split_block_list->link = NULL;
while (p) {
Split_Block_Node *hold = p;
p = p->link;
zfree(hold, sizeof(Split_Block_Node));
}
}
/*
* This function is passed a pointer to the tail of the list,
* adds a new node and returns the new tail
* This makes the list one node bigger
*/
static Split_Block_Node *
grow_sp_list(Split_Block_Node * tail)
{
tail->link = (Split_Block_Node *) zmalloc(sizeof(Split_Block_Node));
tail = tail->link;
tail->link = NULL;
return tail;
}
/*
* Split string s of length slen on SPACE without changing s.
* Load the pieces into STRINGS
* return the number of pieces
*/
size_t
space_split(const char *s, size_t slen)
{
size_t cnt = 0;
const char *end = s + slen;
Split_Block_Node *node_p = split_block_list;
unsigned idx = 0;
while (1) {
/* eat space */
while (scan_code[*(const unsigned char *) s] == SC_SPACE)
s++;
if (s == end)
return cnt;
/* find one field */
{
const char *q = s++; /* q is front of field */
while (s < end && scan_code[*(const unsigned char *) s] != SC_SPACE)
s++;
/* create and store the string field */
node_p->strings[idx] = new_STRING1(q, (size_t) (s - q));
cnt++;
if (++idx == SP_SIZE) {
idx = 0;
node_p = grow_sp_list(node_p);
}
}
}
/* not reached */
}
/*
* re is an (RE_DATA *) with compiled PTR holding the compiled RE
*/
size_t
re_split(char *s, size_t slen, PTR re)
{
size_t cnt = 0;
const char *end = s + slen;
Split_Block_Node *node_p = split_block_list;
unsigned idx = 0;
int no_bol = 0;
if (slen == 0)
return 0;
while (s < end) {
size_t mlen;
char *m = re_pos_match(s, (size_t) (end - s), re, &mlen, no_bol);
if (m) {
/* stuff in front of match is a field, might have length zero */
node_p->strings[idx] = new_STRING1(s, (size_t) (m - s));
cnt++;
if (++idx == SP_SIZE) {
idx = 0;
node_p = grow_sp_list(node_p);
}
s = m + mlen;
no_bol = 1;
} else {
/* no match so last field is what's left */
node_p->strings[idx] = new_STRING1(s, (size_t) (end - s));
/* done so don't need to increment idx */
return ++cnt;
}
}
/* last match at end of s, so last field is "" */
node_p->strings[idx] = new_STRING0((size_t) 0);
return ++cnt;
}
/*
* Matches a string with a regular expression, but only matches of positive
* length count.
* input is a string str and its length.
* return is match point else 0 if no match
* length of match is returned in *lenp
*/
char *
re_pos_match(char *str, size_t str_len, RE_NODE * re, size_t *lenp, int no_bol)
{
const char *end = str + str_len;
while (str < end) {
char *match = REmatch(str, (size_t) (end - str),
cast_to_re(re), lenp, no_bol);
if (match) {
if (*lenp) {
/* match of positive length so done */
return match;
} else {
/* match but zero length, move str forward and try again */
/* note this match must have occurred at front of str */
str = match + 1;
no_bol = 1;
}
} else {
/* no match */
*lenp = 0;
return NULL;
}
}
*lenp = 0;
return NULL;
}
/*
* like space split but splits s into single character strings
*/
size_t
null_split(const char *s, size_t slen)
{
const char *end = s + slen;
Split_Block_Node *node_p = split_block_list;
unsigned idx = 0;
while (s < end) {
node_p->strings[idx] = new_STRING1(s++, (size_t) 1);
if (++idx == SP_SIZE) {
idx = 0;
node_p = grow_sp_list(node_p);
}
}
return slen;
}
/*
* The caller knows there are cnt STRING* in the split_block_list
* buffers. This function uses them to make CELLs in cp[]
* The target CELLs are virgin, they don't need to be
* destroyed
*/
void
transfer_to_array(CELL cp[], size_t cnt)
{
Split_Block_Node *node_p = split_block_list;
unsigned idx = 0;
while (cnt > 0) {
cp->type = C_MBSTRN;
cp->ptr = (PTR) node_p->strings[idx];
cnt--;
cp++;
if (++idx == SP_SIZE) {
idx = 0;
node_p = node_p->link;
}
}
if (node_p != split_block_list)
spb_list_shrink();
}
/*
* like above but transfers the saved pieces to $1, $2 ... $cnt
* The target CELLs may be string type so need to be destroyed
* The caller has made sure the target CELLs exist
*/
void
transfer_to_fields(size_t cnt)
{
CELL *fp = &field[1]; /* start with $1 */
CELL *fp_end = &field[FBANK_SZ];
Split_Block_Node *node_p = split_block_list;
unsigned idx = 0;
unsigned fb_idx = 0;
while (cnt > 0) {
cell_destroy(fp);
fp->type = C_MBSTRN;
fp->ptr = (PTR) node_p->strings[idx];
cnt--;
if (++idx == SP_SIZE) {
idx = 0;
node_p = node_p->link;
}
if (++fp == fp_end) {
fb_idx++;
fp = &fbankv[fb_idx][0];
fp_end = fp + FBANK_SZ;
}
}
if (node_p != split_block_list)
spb_list_shrink();
}
/*
* split(s, X, r)
* split s into array X on r
*
* mawk state is EXECUTION sp points at top of eval_stack[]
*
* entry: sp[0] holds r
* sp[-1] pts at X
* sp[-2] holds s
*
* exit : sp is 2 less, sp[0] is C_DOUBLE CELL with value equal
* to the number of split pieces
*/
CELL *
bi_split(CELL *sp)
{
size_t cnt = 0; /* the number of pieces */
if (sp->type < C_RE)
cast_for_split(sp);
/* can be C_RE, C_SPACE or C_SNULL */
sp -= 2;
if (sp->type < C_STRING)
cast1_to_s(sp);
if (string(sp)->len == 0) { /* nothing to split */
cnt = 0;
} else {
switch ((sp + 2)->type) {
case C_RE:
if (isEmpty_RE((sp + 2)->ptr)) {
cnt = null_split(string(sp)->str, string(sp)->len);
} else {
cnt = re_split(string(sp)->str, string(sp)->len,
(sp + 2)->ptr);
}
break;
case C_SPACE:
cnt = space_split(string(sp)->str, string(sp)->len);
break;
case C_SNULL: /* split on empty string */
cnt = null_split(string(sp)->str, string(sp)->len);
break;
default:
bozo("bad splitting cell in bi_split");
}
}
free_STRING(string(sp));
sp->type = C_DOUBLE;
sp->dval = (double) cnt;
array_load((ARRAY) (sp + 1)->ptr, cnt);
return sp;
}