mawk/split.c
2010-12-10 17:00:00 -05:00

330 lines
6.6 KiB
C

/********************************************
split.c
copyright 2008-2009,2010, Thomas E. Dickey
copyright 1991-1993,1996, Michael D. Brennan
This is a source file for mawk, an implementation of
the AWK programming language.
Mawk is distributed without warranty under the terms of
the GNU General Public License, version 2, 1991.
********************************************/
/*
* $MawkId: split.c,v 1.21 2010/12/10 17:00:00 tom Exp $
* @Log: split.c,v @
* Revision 1.3 1996/02/01 04:39:42 mike
* dynamic array scheme
*
* Revision 1.2 1993/07/15 01:55:03 mike
* rm SIZE_T & indent
*
* Revision 1.1.1.1 1993/07/03 18:58:21 mike
* move source to cvs
*
* Revision 5.4 1993/05/08 18:06:00 mike
* null_split
*
* Revision 5.3 1993/01/01 21:30:48 mike
* split new_STRING() into new_STRING and new_STRING0
*
* Revision 5.2 1992/07/08 21:19:09 brennan
* patch2
* change in split() requires that
* bi_split() call load_array() even
* when cnt is 0.
*
* Revision 5.1 1991/12/05 07:56:31 brennan
* 1.1 pre-release
*
*/
/* split.c */
/* For all splitting, up to MAX_SPLIT fields go into
split_buff[]; the rest go onto split_ov_list (the split
overflow list).
We can split in one of three ways:
(1) By space:
space_split() and space_ov_split()
(2) By regular expression:
re_split() and re_ov_split()
(3) By "" (null -- split into characters):
null_split() and null_ov_split()
*/
#define TEMPBUFF_GOES_HERE
#include "mawk.h"
#include "symtype.h"
#include "bi_vars.h"
#include "bi_funct.h"
#include "memory.h"
#include "scan.h"
#include "regexp.h"
#include "repl.h"
#include "field.h"
/* Head of the overflow list.  It is assigned by space_ov_split(),
 * re_ov_split() and null_ov_split() in this file.
 */
SPLIT_OV *split_ov_list;
/* Advance s while the character it points at is classified as
 * SC_SPACE by scan_code[].  Expects a char pointer named s in the
 * enclosing scope.
 */
#define EAT_SPACE() while ( scan_code[*(unsigned char*)s] ==\
SC_SPACE ) s++
/* Advance s until it points at a character classified as SC_SPACE.
 * Expects char pointers named s and back in the enclosing scope.
 * A blank is stored at *back for the duration of the scan and a 0
 * byte is stored there afterwards (presumably scan_code[' '] is
 * SC_SPACE, so the scan cannot run past back -- confirm in scan.h).
 *
 * NOTE: this expands to several statements that are not wrapped in
 * do { } while (0), so it must not be used as the body of an
 * unbraced if/else/loop.
 */
#define EAT_NON_SPACE() \
*back = ' ' ; /* sentinel */\
while ( scan_code[*(unsigned char*)s] != SC_SPACE ) s++ ;\
*back = 0
/*
 * Overflow path for space_split(): every remaining space-delimited
 * piece of s is copied into a new STRING and appended to a chain that
 * is published through split_ov_list.
 *
 * back is the location where EAT_NON_SPACE() plants its sentinel;
 * space_split() passes s + slen.
 *
 * Returns the number of pieces placed on the list.
 */
static size_t
space_ov_split(char *s, char *back)
{
    SPLIT_OV head;              /* dummy node, only its link is used */
    SPLIT_OV *last = &head;
    size_t pieces = 0;

    for (;;) {
        char *start;
        size_t width;

        EAT_SPACE();
        if (*s == 0)
            break;              /* nothing but space was left */

        /* s is on a non-space character: remember it and scan the piece */
        start = s++;
        EAT_NON_SPACE();
        width = (size_t) (s - start);

        last->link = ZMALLOC(SPLIT_OV);
        last = last->link;
        last->sval = new_STRING0(width);
        memcpy(last->sval->str, start, width);
        pieces++;
    }

    /* terminate the chain; when no piece was found this leaves the list empty */
    last->link = (SPLIT_OV *) 0;
    split_ov_list = head.link;
    return pieces;
}
/*
 * Split the string s (slen bytes long) on SPACE.
 *
 * Each piece is turned into a STRING whose pointer is stored in
 * split_buff[].  Once (MAX_SPLIT / 3) * 3 pieces have been stored, the
 * rest of the string is handed to space_ov_split(), which puts the
 * remaining pieces on split_ov_list.
 *
 * The byte at s + slen is overwritten temporarily by EAT_NON_SPACE()
 * and is left holding 0.
 *
 * Returns the total number of pieces.
 */
size_t
space_split(char *s, size_t slen)
{
    char *back = s + slen;      /* sentinel position for EAT_NON_SPACE() */
    size_t capacity = (size_t) (MAX_SPLIT / 3) * 3;
    size_t filled = 0;
    char *start;

    while (filled < capacity) {
        EAT_SPACE();
        if (*s == 0)
            return filled;      /* whole string consumed */

        /* mark the front of the piece, then find its end */
        start = s++;
        EAT_NON_SPACE();
        split_buff[filled++] = new_STRING1(start, (size_t) (s - start));
    }

    /* split_buff[] is full: the remainder goes onto the overflow list */
    return filled + space_ov_split(s, back);
}
/*
 * Match a string against a regular expression, but only matches of
 * positive length count.
 *
 *   s        start of the text to search
 *   str_len  number of bytes available at s
 *   re       the expression; it is handed to REmatch() via cast_to_re()
 *   lenp     receives the length reported by REmatch()
 *
 * Returns a pointer to the start of the first match whose length is
 * non-zero, or 0 when there is none.
 *
 * The remaining length is recomputed from a fixed end pointer on every
 * pass.  REmatch() may report an empty match at a position later than
 * the one it was given; counting the length down by one per pass would
 * then overstate how many bytes are left.
 */
char *
re_pos_match(char *s, size_t str_len, PTR re, size_t *lenp)
{
    char *limit = s + str_len;

    while (s < limit) {
        s = REmatch(s, (size_t) (limit - s), cast_to_re(re), lenp);
        if (s == 0)
            break;              /* no match at all */
        if (*lenp)
            return s;           /* match of positive length */
        if (*s == 0)
            break;              /* empty match sitting on a 0 byte */
        s++;                    /* step over the empty match and retry */
    }
    return 0;
}
/*
 * split_buff[] has overflowed; split what is left of the string
 * (slen bytes starting at s) on re and put the pieces on split_ov_list.
 *
 * The text after the last separator always becomes a piece of its own,
 * so at least one piece is produced.
 *
 * Returns the number of pieces.
 */
static size_t
re_ov_split(char *s, size_t slen, PTR re)
{
    SPLIT_OV head;              /* dummy node, only its link is used */
    SPLIT_OV *last = &head;
    char *end = s + slen;
    char *hit;
    size_t hit_len;
    size_t pieces = 1;          /* accounts for the trailing piece */

    while (s < end) {
        hit = re_pos_match(s, (size_t) (end - s), re, &hit_len);
        if (!hit)
            break;

        /* the text in front of the separator is one piece */
        last->link = ZMALLOC(SPLIT_OV);
        last = last->link;
        last->sval = new_STRING1(s, (size_t) (hit - s));

        s = hit + hit_len;      /* resume after the separator */
        pieces++;
    }

    /* and one more: whatever follows the last separator */
    last->link = ZMALLOC(SPLIT_OV);
    last = last->link;
    last->sval = new_STRING1(s, (size_t) (end - s));
    last->link = (SPLIT_OV *) 0;

    split_ov_list = head.link;
    return pieces;
}
/*
 * One splitting step for re_split().  The expansion uses the caller's
 * locals s, t, slen, mlen, limit, i and re, and jumps to the caller's
 * label "done".
 *
 *  - If re_pos_match() finds no separator in the slen bytes at s,
 *    control goes to done with s and slen untouched.
 *  - Otherwise the text in front of the separator is stored in
 *    split_buff[i++] and s is moved past the separator.
 *  - If s has then moved beyond limit, slen is set to (size_t)(-1)
 *    and control goes to done; otherwise slen becomes the number of
 *    bytes left before limit.
 *
 * The last statement has no trailing semicolon; the invocation
 * supplies it.
 */
#define RE_SPLIT3 \
if (!(t = re_pos_match(s, slen, re, &mlen))) \
goto done; \
split_buff[i++] = new_STRING1(s, (size_t) (t - s)); \
s = t + mlen; \
if (s > limit) { \
slen = (size_t) (-1); \
goto done; \
} \
slen = (size_t) (limit - s)
/*
 * Split the STRING s_param on the regular expression re.
 *
 * Pieces are stored in split_buff[].  After (MAX_SPLIT / 3) * 3 pieces
 * the remainder of the string is passed to re_ov_split(), which puts
 * the remaining pieces on split_ov_list.
 *
 * Returns the total number of pieces.
 */
size_t
re_split(STRING * s_param, PTR re)
{
    char *s = s_param->str;
    char *limit = s + s_param->len;
    char *t;
    size_t i = 0;
    size_t slen = s_param->len;
    size_t mlen;
    int lcnt = MAX_SPLIT / 3;

    while (lcnt--) {
        RE_SPLIT3;
        RE_SPLIT3;
        RE_SPLIT3;
    }
    /* we've overflowed */
    return i + re_ov_split(s, slen, re);

  done:
    /*
     * RE_SPLIT3 sets slen to (size_t)(-1) when s has moved beyond limit,
     * in which case there is no trailing piece.  Compare against that
     * sentinel directly: narrowing slen to int would misread any
     * remaining length that does not fit in an int.
     */
    if (slen != (size_t) (-1)) {
        split_buff[i++] = new_STRING1(s, slen);
    }
    return i;
}
/*
 * Overflow path for null_split(): each of the slen bytes at s becomes
 * a one-character STRING appended to a chain that is published through
 * split_ov_list.
 *
 * Returns the number of pieces, which equals slen.
 */
static size_t
null_ov_split(char *s, size_t slen)
{
    SPLIT_OV head;              /* dummy node, only its link is used */
    SPLIT_OV *last = &head;
    size_t k;

    for (k = 0; k < slen; k++) {
        last->link = ZMALLOC(SPLIT_OV);
        last = last->link;
        last->sval = new_STRING0((size_t) 1);
        last->sval->str[0] = s[k];
    }

    last->link = (SPLIT_OV *) 0;
    split_ov_list = head.link;
    return k;
}
/*
 * Split the slen bytes at s into single characters.
 *
 * The first MAX_SPLIT characters become one-character STRINGs stored
 * in split_buff[]; any characters beyond that are handed to
 * null_ov_split().
 *
 * Returns the total number of pieces.
 */
size_t
null_split(char *s, size_t slen)
{
    size_t cnt = 0;             /* pieces so far; also indexes split_buff[] */

    while (slen != 0 && cnt != MAX_SPLIT) {
        STRING *one = new_STRING0((size_t) 1);

        one->str[0] = *s++;
        split_buff[cnt++] = one;
        --slen;
    }

    /* anything still unread did not fit into split_buff[] */
    if (slen != 0)
        cnt += null_ov_split(s, slen);

    return cnt;
}
/* split(s, X, r)
 * split s into array X on r
 *
 * entry: sp[0] holds r
 *        sp[-1] pts at X
 *        sp[-2] holds s
 *
 * exit:  the cell that held s becomes a C_DOUBLE holding the number of
 *        pieces, and a pointer to that cell is returned.
 */
CELL *
bi_split(CELL * sp)
{
    CELL *sep = sp;             /* r: the separator cell */
    CELL *src = sp - 2;         /* s: the cell being split, also the result */
    size_t cnt = 0;             /* the number of pieces */

    if (sep->type < C_RE)
        cast_for_split(sep);
    /* sep can now be C_RE, C_SPACE or C_SNULL */

    if (src->type < C_STRING)
        cast1_to_s(src);

    /* an empty string has nothing to split, cnt stays 0 */
    if (string(src)->len != 0) {
        switch (sep->type) {
        case C_RE:
            cnt = re_split(string(src), sep->ptr);
            break;

        case C_SPACE:
            cnt = space_split(string(src)->str, string(src)->len);
            break;

        case C_SNULL:           /* split on empty string */
            cnt = null_split(string(src)->str, string(src)->len);
            break;

        default:
            bozo("bad splitting cell in bi_split");
        }
    }

    free_STRING(string(src));
    src->type = C_DOUBLE;
    src->dval = (double) cnt;

    /* array_load() is called even when cnt is 0 */
    array_load((ARRAY) (src + 1)->ptr, cnt);

    return src;
}