mirror of
https://github.com/ThomasDickey/mawk-snapshots.git
synced 2026-01-26 11:04:34 +00:00
327 lines
7.2 KiB
C
327 lines
7.2 KiB
C
/********************************************
|
|
split.c
|
|
copyright 2008-2020,2024 Thomas E. Dickey
|
|
copyright 1991-1996,2014 Michael D. Brennan
|
|
|
|
This is a source file for mawk, an implementation of
|
|
the AWK programming language.
|
|
|
|
Mawk is distributed without warranty under the terms of
|
|
the GNU General Public License, version 2, 1991.
|
|
********************************************/
|
|
|
|
/*
|
|
* $MawkId: split.c,v 1.37 2024/12/14 12:53:38 tom Exp $
|
|
*/
|
|
|
|
#define Visible_BI_REC
|
|
#define Visible_CELL
|
|
#define Visible_RE_DATA
|
|
#define Visible_STRING
|
|
|
|
#include <mawk.h>
|
|
#include <split.h>
|
|
#include <symtype.h>
|
|
#include <bi_vars.h>
|
|
#include <bi_funct.h>
|
|
#include <memory.h>
|
|
#include <scan.h>
|
|
#include <regexp.h>
|
|
#include <field.h>
|
|
|
|
#ifndef SP_SIZE
|
|
#ifdef NO_LEAKS
|
|
#define SP_SIZE 4 /* exercises split_block_list code */
|
|
#else
|
|
#define SP_SIZE 2048
|
|
#endif
|
|
#endif
|
|
|
|
typedef struct split_block {
|
|
STRING *strings[SP_SIZE];
|
|
struct split_block *link;
|
|
} Split_Block_Node;
|
|
|
|
static Split_Block_Node split_block_base;
|
|
static Split_Block_Node *split_block_list = &split_block_base;
|
|
|
|
/*
|
|
* Usually the list is of size 1.
|
|
* The list never gets smaller than size 1.
|
|
* This function returns a bigger list to size 1.
|
|
*/
|
|
static void
|
|
spb_list_shrink(void)
|
|
{
|
|
Split_Block_Node *p = split_block_list->link;
|
|
split_block_list->link = NULL;
|
|
while (p) {
|
|
Split_Block_Node *hold = p;
|
|
p = p->link;
|
|
zfree(hold, sizeof(Split_Block_Node));
|
|
}
|
|
}
|
|
|
|
/*
|
|
* This function is passed a pointer to the tail of the list,
|
|
* adds a new node and returns the new tail
|
|
* This makes the list one node bigger
|
|
*/
|
|
static Split_Block_Node *
|
|
grow_sp_list(Split_Block_Node * tail)
|
|
{
|
|
tail->link = (Split_Block_Node *) zmalloc(sizeof(Split_Block_Node));
|
|
tail = tail->link;
|
|
tail->link = NULL;
|
|
return tail;
|
|
}
|
|
|
|
/*
|
|
* Split string s of length slen on SPACE without changing s.
|
|
* Load the pieces into STRINGS
|
|
* return the number of pieces
|
|
*/
|
|
size_t
|
|
space_split(const char *s, size_t slen)
|
|
{
|
|
size_t cnt = 0;
|
|
const char *end = s + slen;
|
|
Split_Block_Node *node_p = split_block_list;
|
|
unsigned idx = 0;
|
|
|
|
while (1) {
|
|
/* eat space */
|
|
while (scan_code[*(const unsigned char *) s] == SC_SPACE)
|
|
s++;
|
|
if (s == end)
|
|
return cnt;
|
|
/* find one field */
|
|
{
|
|
const char *q = s++; /* q is front of field */
|
|
while (s < end && scan_code[*(const unsigned char *) s] != SC_SPACE)
|
|
s++;
|
|
/* create and store the string field */
|
|
node_p->strings[idx] = new_STRING1(q, (size_t) (s - q));
|
|
cnt++;
|
|
if (++idx == SP_SIZE) {
|
|
idx = 0;
|
|
node_p = grow_sp_list(node_p);
|
|
}
|
|
}
|
|
}
|
|
/* not reached */
|
|
}
|
|
|
|
/*
|
|
* re is an (RE_DATA *) with compiled PTR holding the compiled RE
|
|
*/
|
|
size_t
|
|
re_split(char *s, size_t slen, PTR re)
|
|
{
|
|
size_t cnt = 0;
|
|
const char *end = s + slen;
|
|
Split_Block_Node *node_p = split_block_list;
|
|
unsigned idx = 0;
|
|
int no_bol = 0;
|
|
|
|
if (slen == 0)
|
|
return 0;
|
|
|
|
while (s < end) {
|
|
size_t mlen;
|
|
char *m = re_pos_match(s, (size_t) (end - s), re, &mlen, no_bol);
|
|
if (m) {
|
|
/* stuff in front of match is a field, might have length zero */
|
|
node_p->strings[idx] = new_STRING1(s, (size_t) (m - s));
|
|
cnt++;
|
|
if (++idx == SP_SIZE) {
|
|
idx = 0;
|
|
node_p = grow_sp_list(node_p);
|
|
}
|
|
s = m + mlen;
|
|
no_bol = 1;
|
|
} else {
|
|
/* no match so last field is what's left */
|
|
node_p->strings[idx] = new_STRING1(s, (size_t) (end - s));
|
|
/* done so don't need to increment idx */
|
|
return ++cnt;
|
|
}
|
|
}
|
|
/* last match at end of s, so last field is "" */
|
|
node_p->strings[idx] = new_STRING0((size_t) 0);
|
|
return ++cnt;
|
|
}
|
|
|
|
/*
|
|
* Matches a string with a regular expression, but only matches of positive
|
|
* length count.
|
|
* input is a string str and its length.
|
|
* return is match point else 0 if no match
|
|
* length of match is returned in *lenp
|
|
*/
|
|
char *
|
|
re_pos_match(char *str, size_t str_len, RE_NODE * re, size_t *lenp, int no_bol)
|
|
{
|
|
const char *end = str + str_len;
|
|
|
|
while (str < end) {
|
|
char *match = REmatch(str, (size_t) (end - str),
|
|
cast_to_re(re), lenp, no_bol);
|
|
if (match) {
|
|
if (*lenp) {
|
|
/* match of positive length so done */
|
|
return match;
|
|
} else {
|
|
/* match but zero length, move str forward and try again */
|
|
/* note this match must have occurred at front of str */
|
|
str = match + 1;
|
|
no_bol = 1;
|
|
}
|
|
} else {
|
|
/* no match */
|
|
*lenp = 0;
|
|
return NULL;
|
|
}
|
|
}
|
|
*lenp = 0;
|
|
return NULL;
|
|
}
|
|
|
|
/*
|
|
* like space split but splits s into single character strings
|
|
*/
|
|
size_t
|
|
null_split(const char *s, size_t slen)
|
|
{
|
|
const char *end = s + slen;
|
|
Split_Block_Node *node_p = split_block_list;
|
|
unsigned idx = 0;
|
|
|
|
while (s < end) {
|
|
node_p->strings[idx] = new_STRING1(s++, (size_t) 1);
|
|
if (++idx == SP_SIZE) {
|
|
idx = 0;
|
|
node_p = grow_sp_list(node_p);
|
|
}
|
|
}
|
|
return slen;
|
|
}
|
|
|
|
/*
|
|
* The caller knows there are cnt STRING* in the split_block_list
|
|
* buffers. This function uses them to make CELLs in cp[]
|
|
* The target CELLs are virgin, they don't need to be
|
|
* destroyed
|
|
*/
|
|
void
|
|
transfer_to_array(CELL cp[], size_t cnt)
|
|
{
|
|
Split_Block_Node *node_p = split_block_list;
|
|
unsigned idx = 0;
|
|
while (cnt > 0) {
|
|
cp->type = C_MBSTRN;
|
|
cp->ptr = (PTR) node_p->strings[idx];
|
|
cnt--;
|
|
cp++;
|
|
if (++idx == SP_SIZE) {
|
|
idx = 0;
|
|
node_p = node_p->link;
|
|
}
|
|
}
|
|
if (node_p != split_block_list)
|
|
spb_list_shrink();
|
|
}
|
|
|
|
/*
|
|
* like above but transfers the saved pieces to $1, $2 ... $cnt
|
|
* The target CELLs may be string type so need to be destroyed
|
|
* The caller has made sure the target CELLs exist
|
|
*/
|
|
void
|
|
transfer_to_fields(size_t cnt)
|
|
{
|
|
CELL *fp = &field[1]; /* start with $1 */
|
|
CELL *fp_end = &field[FBANK_SZ];
|
|
Split_Block_Node *node_p = split_block_list;
|
|
unsigned idx = 0;
|
|
unsigned fb_idx = 0;
|
|
|
|
while (cnt > 0) {
|
|
cell_destroy(fp);
|
|
fp->type = C_MBSTRN;
|
|
fp->ptr = (PTR) node_p->strings[idx];
|
|
cnt--;
|
|
if (++idx == SP_SIZE) {
|
|
idx = 0;
|
|
node_p = node_p->link;
|
|
}
|
|
if (++fp == fp_end) {
|
|
fb_idx++;
|
|
fp = &fbankv[fb_idx][0];
|
|
fp_end = fp + FBANK_SZ;
|
|
}
|
|
}
|
|
if (node_p != split_block_list)
|
|
spb_list_shrink();
|
|
}
|
|
|
|
/*
|
|
* split(s, X, r)
|
|
* split s into array X on r
|
|
*
|
|
* mawk state is EXECUTION sp points at top of eval_stack[]
|
|
*
|
|
* entry: sp[0] holds r
|
|
* sp[-1] pts at X
|
|
* sp[-2] holds s
|
|
*
|
|
* exit : sp is 2 less, sp[0] is C_DOUBLE CELL with value equal
|
|
* to the number of split pieces
|
|
*/
|
|
CELL *
|
|
bi_split(CELL *sp)
|
|
{
|
|
size_t cnt = 0; /* the number of pieces */
|
|
|
|
if (sp->type < C_RE)
|
|
cast_for_split(sp);
|
|
/* can be C_RE, C_SPACE or C_SNULL */
|
|
sp -= 2;
|
|
if (sp->type < C_STRING)
|
|
cast1_to_s(sp);
|
|
|
|
if (string(sp)->len == 0) { /* nothing to split */
|
|
cnt = 0;
|
|
} else {
|
|
switch ((sp + 2)->type) {
|
|
case C_RE:
|
|
if (isEmpty_RE((sp + 2)->ptr)) {
|
|
cnt = null_split(string(sp)->str, string(sp)->len);
|
|
} else {
|
|
cnt = re_split(string(sp)->str, string(sp)->len,
|
|
(sp + 2)->ptr);
|
|
}
|
|
break;
|
|
|
|
case C_SPACE:
|
|
cnt = space_split(string(sp)->str, string(sp)->len);
|
|
break;
|
|
|
|
case C_SNULL: /* split on empty string */
|
|
cnt = null_split(string(sp)->str, string(sp)->len);
|
|
break;
|
|
|
|
default:
|
|
bozo("bad splitting cell in bi_split");
|
|
}
|
|
}
|
|
|
|
free_STRING(string(sp));
|
|
sp->type = C_DOUBLE;
|
|
sp->dval = (double) cnt;
|
|
|
|
array_load((ARRAY) (sp + 1)->ptr, cnt);
|
|
|
|
return sp;
|
|
}
|