mirror of
https://github.com/Perl/perl5.git
synced 2026-01-26 08:38:23 +00:00
SV bodies are Zero()ed when allocated/uprooted from an arena for use. This commit changes instances where a fresh body field is unnecessarily assigned a zero/NULL value into an assertion that the field already contains the desired value.
1544 lines
56 KiB
C
1544 lines
56 KiB
C
#ifdef PERL_EXT_RE_BUILD
|
|
#include "re_top.h"
|
|
#endif
|
|
|
|
#include "EXTERN.h"
|
|
#define PERL_IN_REGEX_ENGINE
|
|
#define PERL_IN_REGCOMP_ANY
|
|
#define PERL_IN_REGCOMP_INVLIST_C
|
|
#include "perl.h"
|
|
|
|
#ifdef PERL_IN_XSUB_RE
|
|
# include "re_comp.h"
|
|
#else
|
|
# include "regcomp.h"
|
|
#endif
|
|
|
|
#include "invlist_inline.h"
|
|
#include "unicode_constants.h"
|
|
#include "regcomp_internal.h"
|
|
|
|
#ifdef PERL_RE_BUILD_AUX
|
|
void
|
|
Perl_populate_bitmap_from_invlist(pTHX_ SV * invlist, const UV offset, const U8 * bitmap, const Size_t len)
|
|
{
|
|
PERL_ARGS_ASSERT_POPULATE_BITMAP_FROM_INVLIST;
|
|
|
|
/* As the name says. The zeroth bit corresponds to the code point given by
|
|
* 'offset' */
|
|
|
|
UV start, end;
|
|
|
|
Zero(bitmap, len, U8);
|
|
|
|
invlist_iterinit(invlist);
|
|
while (invlist_iternext(invlist, &start, &end)) {
|
|
assert(start >= offset);
|
|
|
|
for (UV i = start; i <= end; i++) {
|
|
UV adjusted = i - offset;
|
|
|
|
BITMAP_BYTE(bitmap, adjusted) |= BITMAP_BIT(adjusted);
|
|
}
|
|
}
|
|
invlist_iterfinish(invlist);
|
|
}
|
|
|
|
void
|
|
Perl_populate_invlist_from_bitmap(pTHX_ const U8 * bitmap, const Size_t bitmap_len, SV ** invlist, const UV offset)
|
|
{
|
|
PERL_ARGS_ASSERT_POPULATE_INVLIST_FROM_BITMAP;
|
|
|
|
/* As the name says. The zeroth bit corresponds to the code point given by
|
|
* 'offset' */
|
|
|
|
Size_t i;
|
|
|
|
for (i = 0; i < bitmap_len; i++) {
|
|
if (BITMAP_TEST(bitmap, i)) {
|
|
int start = i++;
|
|
|
|
/* Save a little work by adding a range all at once instead of bit
|
|
* by bit */
|
|
while (i < bitmap_len && BITMAP_TEST(bitmap, i)) {
|
|
i++;
|
|
}
|
|
|
|
*invlist = add_range_to_invlist_(*invlist,
|
|
start + offset,
|
|
i + offset - 1);
|
|
}
|
|
}
|
|
}
|
|
#endif /* PERL_RE_BUILD_AUX */
|
|
|
|
/* This section of code defines the inversion list object and its methods. The
|
|
* interfaces are highly subject to change, so as much as possible is static to
|
|
* this file. An inversion list is here implemented as a malloc'd C UV array
|
|
* as an SVt_INVLIST scalar.
|
|
*
|
|
* An inversion list for Unicode is an array of code points, sorted by ordinal
|
|
* number. Each element gives the code point that begins a range that extends
|
|
* up-to but not including the code point given by the next element. The final
|
|
* element gives the first code point of a range that extends to the platform's
|
|
* infinity. The even-numbered elements (invlist[0], invlist[2], invlist[4],
|
|
* ...) give ranges whose code points are all in the inversion list. We say
|
|
* that those ranges are in the set. The odd-numbered elements give ranges
|
|
* whose code points are not in the inversion list, and hence not in the set.
|
|
* Thus, element [0] is the first code point in the list. Element [1]
|
|
* is the first code point beyond that not in the list; and element [2] is the
|
|
* first code point beyond that that is in the list. In other words, the first
|
|
* range is invlist[0]..(invlist[1]-1), and all code points in that range are
|
|
* in the inversion list. The second range is invlist[1]..(invlist[2]-1), and
|
|
* all code points in that range are not in the inversion list. The third
|
|
* range invlist[2]..(invlist[3]-1) gives code points that are in the inversion
|
|
* list, and so forth. Thus every element whose index is divisible by two
|
|
* gives the beginning of a range that is in the list, and every element whose
|
|
* index is not divisible by two gives the beginning of a range not in the
|
|
* list. If the final element's index is divisible by two, the inversion list
|
|
* extends to the platform's infinity; otherwise the highest code point in the
|
|
* inversion list is the contents of that element minus 1.
|
|
*
|
|
* A range that contains just a single code point N will look like
|
|
* invlist[i] == N
|
|
* invlist[i+1] == N+1
|
|
*
|
|
* If N is UV_MAX (the highest representable code point on the machine), N+1 is
|
|
* impossible to represent, so element [i+1] is omitted. The single element
|
|
* inversion list
|
|
* invlist[0] == UV_MAX
|
|
* contains just UV_MAX, but is interpreted as matching to infinity.
|
|
*
|
|
* Taking the complement (inverting) an inversion list is quite simple, if the
|
|
* first element is 0, remove it; otherwise add a 0 element at the beginning.
|
|
* This implementation reserves an element at the beginning of each inversion
|
|
* list to always contain 0; there is an additional flag in the header which
|
|
* indicates if the list begins at the 0, or is offset to begin at the next
|
|
* element. This means that the inversion list can be inverted without any
|
|
* copying; just flip the flag.
|
|
*
|
|
* More about inversion lists can be found in "Unicode Demystified"
|
|
* Chapter 13 by Richard Gillam, published by Addison-Wesley.
|
|
*
|
|
* The inversion list data structure is currently implemented as an SV pointing
|
|
* to an array of UVs that the SV thinks are bytes. This allows us to have an
|
|
* array of UV whose memory management is automatically handled by the existing
|
|
* facilities for SV's.
|
|
*
|
|
* Some of the methods should always be private to the implementation, and some
|
|
* should eventually be made public */
|
|
|
|
/* The header definitions are in F<invlist_inline.h> */
|
|
|
|
#ifndef PERL_IN_XSUB_RE
|
|
|
|
PERL_STATIC_INLINE UV*
|
|
S_invlist_array_init_(SV* const invlist, const bool will_have_0)
|
|
{
|
|
/* Returns a pointer to the first element in the inversion list's array.
|
|
* This is called upon initialization of an inversion list. Where the
|
|
* array begins depends on whether the list has the code point U+0000 in it
|
|
* or not. The other parameter tells it whether the code that follows this
|
|
* call is about to put a 0 in the inversion list or not. The first
|
|
* element is either the element reserved for 0, if true, or the element
|
|
* after it, if false */
|
|
|
|
bool* offset = get_invlist_offset_addr(invlist);
|
|
UV* zero_addr = (UV *) SvPVX(invlist);
|
|
|
|
PERL_ARGS_ASSERT_INVLIST_ARRAY_INIT_;
|
|
|
|
/* Must be empty */
|
|
assert(! invlist_len_(invlist));
|
|
|
|
*zero_addr = 0;
|
|
|
|
/* 1^1 = 0; 1^0 = 1 */
|
|
*offset = 1 ^ will_have_0;
|
|
return zero_addr + *offset;
|
|
}
|
|
|
|
STATIC void
|
|
S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src)
|
|
{
|
|
/* Replaces the inversion list in 'dest' with the one from 'src'. It
|
|
* steals the list from 'src', so 'src' is made to have a NULL list. This
|
|
* is similar to what SvSetMagicSV() would do, if it were implemented on
|
|
* inversion lists, though this routine avoids a copy */
|
|
|
|
const UV src_len = invlist_len_(src);
|
|
const bool src_offset = *get_invlist_offset_addr(src);
|
|
const STRLEN src_byte_len = SvLEN(src);
|
|
char * array = SvPVX(src);
|
|
|
|
#ifndef NO_TAINT_SUPPORT
|
|
const int oldtainted = TAINT_get;
|
|
#endif
|
|
|
|
PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST_DESTROYS_SRC;
|
|
|
|
assert(is_invlist(src));
|
|
assert(is_invlist(dest));
|
|
assert(! invlist_is_iterating(src));
|
|
assert(SvCUR(src) == 0 || SvCUR(src) < SvLEN(src));
|
|
|
|
/* Make sure it ends in the right place with a NUL, as our inversion list
|
|
* manipulations aren't careful to keep this true, but sv_usepvn_flags()
|
|
* asserts it */
|
|
array[src_byte_len - 1] = '\0';
|
|
|
|
TAINT_NOT; /* Otherwise it breaks */
|
|
sv_usepvn_flags(dest,
|
|
(char *) array,
|
|
src_byte_len - 1,
|
|
|
|
/* This flag is documented to cause a copy to be avoided */
|
|
SV_HAS_TRAILING_NUL);
|
|
TAINT_set(oldtainted);
|
|
SvPV_set(src, 0);
|
|
SvLEN_set(src, 0);
|
|
SvCUR_set(src, 0);
|
|
|
|
/* Finish up copying over the other fields in an inversion list */
|
|
*get_invlist_offset_addr(dest) = src_offset;
|
|
invlist_set_len(dest, src_len, src_offset);
|
|
*get_invlist_previous_index_addr(dest) = 0;
|
|
invlist_iterfinish(dest);
|
|
}
|
|
|
|
PERL_STATIC_INLINE IV*
|
|
S_get_invlist_previous_index_addr(SV* invlist)
|
|
{
|
|
/* Return the address of the IV that is reserved to hold the cached index
|
|
* */
|
|
PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
|
|
|
|
assert(is_invlist(invlist));
|
|
|
|
return &(((XINVLIST*) SvANY(invlist))->prev_index);
|
|
}
|
|
|
|
PERL_STATIC_INLINE IV
|
|
S_invlist_previous_index(SV* const invlist)
|
|
{
|
|
/* Returns cached index of previous search */
|
|
|
|
PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
|
|
|
|
return *get_invlist_previous_index_addr(invlist);
|
|
}
|
|
|
|
PERL_STATIC_INLINE void
|
|
S_invlist_set_previous_index(SV* const invlist, const IV index)
|
|
{
|
|
/* Caches <index> for later retrieval */
|
|
|
|
PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
|
|
|
|
assert(index == 0 || index < (int) invlist_len_(invlist));
|
|
|
|
*get_invlist_previous_index_addr(invlist) = index;
|
|
}
|
|
|
|
PERL_STATIC_INLINE void
|
|
S_invlist_trim(SV* invlist)
|
|
{
|
|
/* Free the not currently-being-used space in an inversion list */
|
|
|
|
/* But don't free up the space needed for the 0 UV that is always at the
|
|
* beginning of the list, nor the trailing NUL */
|
|
const UV min_size = TO_INTERNAL_SIZE(1) + 1;
|
|
|
|
PERL_ARGS_ASSERT_INVLIST_TRIM;
|
|
|
|
assert(is_invlist(invlist));
|
|
|
|
SvPV_renew(invlist, MAX(min_size, SvCUR(invlist) + 1));
|
|
}
|
|
|
|
PERL_STATIC_INLINE void
|
|
S_invlist_clear(pTHX_ SV* invlist) /* Empty the inversion list */
|
|
{
|
|
PERL_ARGS_ASSERT_INVLIST_CLEAR;
|
|
|
|
assert(is_invlist(invlist));
|
|
|
|
invlist_set_len(invlist, 0, 0);
|
|
invlist_trim(invlist);
|
|
}
|
|
|
|
PERL_STATIC_INLINE UV
|
|
S_invlist_max(const SV* const invlist)
|
|
{
|
|
/* Returns the maximum number of elements storable in the inversion list's
|
|
* array, without having to realloc() */
|
|
|
|
PERL_ARGS_ASSERT_INVLIST_MAX;
|
|
|
|
assert(is_invlist(invlist));
|
|
|
|
/* Assumes worst case, in which the 0 element is not counted in the
|
|
* inversion list, so subtracts 1 for that */
|
|
return SvLEN(invlist) == 0 /* This happens under new_invlist_C_array_ */
|
|
? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
|
|
: FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
|
|
}
|
|
|
|
STATIC void
|
|
S_initialize_invlist_guts(pTHX_ SV* invlist, const Size_t initial_size)
|
|
{
|
|
PERL_ARGS_ASSERT_INITIALIZE_INVLIST_GUTS;
|
|
|
|
/* First 1 is in case the zero element isn't in the list; second 1 is for
|
|
* trailing NUL */
|
|
SvGROW(invlist, TO_INTERNAL_SIZE(initial_size + 1) + 1);
|
|
invlist_set_len(invlist, 0, 0);
|
|
|
|
/* Force iterinit() to be used to get iteration to work */
|
|
invlist_iterfinish(invlist);
|
|
|
|
*get_invlist_previous_index_addr(invlist) = 0;
|
|
SvPOK_on(invlist); /* This allows B to extract the PV */
|
|
}
|
|
|
|
SV*
|
|
Perl_new_invlist_(pTHX_ IV initial_size)
|
|
{
|
|
|
|
/* Return a pointer to a newly constructed inversion list, with enough
|
|
* space to store 'initial_size' elements. If that number is negative, a
|
|
* system default is used instead */
|
|
|
|
SV* new_list;
|
|
|
|
if (initial_size < 0) {
|
|
initial_size = 10;
|
|
}
|
|
|
|
new_list = newSV_type(SVt_INVLIST);
|
|
initialize_invlist_guts(new_list, initial_size);
|
|
|
|
return new_list;
|
|
}
|
|
|
|
SV*
|
|
Perl_new_invlist_C_array_(pTHX_ const UV* const list)
|
|
{
|
|
/* Return a pointer to a newly constructed inversion list, initialized to
|
|
* point to <list>, which has to be in the exact correct inversion list
|
|
* form, including internal fields. Thus this is a dangerous routine that
|
|
* should not be used in the wrong hands. The passed in 'list' contains
|
|
* several header fields at the beginning that are not part of the
|
|
* inversion list body proper */
|
|
|
|
const STRLEN length = (STRLEN) list[0];
|
|
const UV version_id = list[1];
|
|
const bool offset = cBOOL(list[2]);
|
|
#define HEADER_LENGTH 3
|
|
/* If any of the above changes in any way, you must change HEADER_LENGTH
|
|
* (if appropriate) and regenerate INVLIST_VERSION_ID by running
|
|
* perl -E 'say int(rand 2**31-1)'
|
|
*/
|
|
#define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
|
|
data structure type, so that one being
|
|
passed in can be validated to be an
|
|
inversion list of the correct vintage.
|
|
*/
|
|
|
|
SV* invlist = newSV_type(SVt_INVLIST);
|
|
|
|
PERL_ARGS_ASSERT_NEW_INVLIST_C_ARRAY_;
|
|
|
|
if (version_id != INVLIST_VERSION_ID) {
|
|
croak("panic: Incorrect version for previously generated inversion list");
|
|
}
|
|
|
|
/* The generated array passed in includes header elements that aren't part
|
|
* of the list proper, so start it just after them */
|
|
SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
|
|
|
|
assert(SvLEN(invlist) == 0); /* Means we own the contents, and the system
|
|
shouldn't touch it */
|
|
|
|
*(get_invlist_offset_addr(invlist)) = offset;
|
|
|
|
/* The 'length' passed to us is the physical number of elements in the
|
|
* inversion list. But if there is an offset the logical number is one
|
|
* less than that */
|
|
invlist_set_len(invlist, length - offset, offset);
|
|
|
|
invlist_set_previous_index(invlist, 0);
|
|
|
|
/* Initialize the iteration pointer. */
|
|
invlist_iterfinish(invlist);
|
|
|
|
SvREADONLY_on(invlist);
|
|
SvPOK_on(invlist);
|
|
|
|
return invlist;
|
|
}
|
|
|
|
STATIC void
|
|
S_append_range_to_invlist_(pTHX_ SV* const invlist,
|
|
const UV start, const UV end)
|
|
{
|
|
/* Subject to change or removal. Append the range from 'start' to 'end' at
|
|
* the end of the inversion list. The range must be above any existing
|
|
* ones. */
|
|
|
|
UV* array;
|
|
UV max = invlist_max(invlist);
|
|
UV len = invlist_len_(invlist);
|
|
bool offset;
|
|
|
|
PERL_ARGS_ASSERT_APPEND_RANGE_TO_INVLIST_;
|
|
|
|
if (len == 0) { /* Empty lists must be initialized */
|
|
offset = start != 0;
|
|
array = invlist_array_init_(invlist, ! offset);
|
|
}
|
|
else {
|
|
/* Here, the existing list is non-empty. The current max entry in the
|
|
* list is generally the first value not in the set, except when the
|
|
* set extends to the end of permissible values, in which case it is
|
|
* the first entry in that final set, and so this call is an attempt to
|
|
* append out-of-order */
|
|
|
|
UV final_element = len - 1;
|
|
array = invlist_array(invlist);
|
|
if ( array[final_element] > start
|
|
|| ELEMENT_RANGE_MATCHES_INVLIST(final_element))
|
|
{
|
|
croak("panic: attempting to append to an inversion list, but "
|
|
"wasn't at the end of the list, final = %" UVuf
|
|
", start = %" UVuf ", match = %c",
|
|
array[final_element], start,
|
|
ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
|
|
}
|
|
|
|
/* Here, it is a legal append. If the new range begins 1 above the end
|
|
* of the range below it, it is extending the range below it, so the
|
|
* new first value not in the set is one greater than the newly
|
|
* extended range. */
|
|
offset = *get_invlist_offset_addr(invlist);
|
|
if (array[final_element] == start) {
|
|
if (end != UV_MAX) {
|
|
array[final_element] = end + 1;
|
|
}
|
|
else {
|
|
/* But if the end is the maximum representable on the machine,
|
|
* assume that infinity was actually what was meant. Just let
|
|
* the range that this would extend to have no end */
|
|
invlist_set_len(invlist, len - 1, offset);
|
|
}
|
|
return;
|
|
}
|
|
}
|
|
|
|
/* Here the new range doesn't extend any existing set. Add it */
|
|
|
|
len += 2; /* Includes an element each for the start and end of range */
|
|
|
|
/* If wll overflow the existing space, extend, which may cause the array to
|
|
* be moved */
|
|
if (max < len) {
|
|
invlist_extend(invlist, len);
|
|
|
|
/* Have to set len here to avoid assert failure in invlist_array() */
|
|
invlist_set_len(invlist, len, offset);
|
|
|
|
array = invlist_array(invlist);
|
|
}
|
|
else {
|
|
invlist_set_len(invlist, len, offset);
|
|
}
|
|
|
|
/* The next item on the list starts the range, the one after that is
|
|
* one past the new range. */
|
|
array[len - 2] = start;
|
|
if (end != UV_MAX) {
|
|
array[len - 1] = end + 1;
|
|
}
|
|
else {
|
|
/* But if the end is the maximum representable on the machine, just let
|
|
* the range have no end */
|
|
invlist_set_len(invlist, len - 1, offset);
|
|
}
|
|
}
|
|
|
|
SSize_t
|
|
Perl_invlist_search_(SV* const invlist, const UV cp)
|
|
{
|
|
/* Searches the inversion list for the entry that contains the input code
|
|
* point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
|
|
* return value is the index into the list's array of the range that
|
|
* contains <cp>, that is, 'i' such that
|
|
* array[i] <= cp < array[i+1]
|
|
*/
|
|
|
|
IV low = 0;
|
|
IV mid;
|
|
IV high = invlist_len_(invlist);
|
|
const IV highest_element = high - 1;
|
|
const UV* array;
|
|
|
|
PERL_ARGS_ASSERT_INVLIST_SEARCH_;
|
|
|
|
/* If list is empty, return failure. */
|
|
if (UNLIKELY(high == 0)) {
|
|
return -1;
|
|
}
|
|
|
|
/* (We can't get the array unless we know the list is non-empty) */
|
|
array = invlist_array(invlist);
|
|
|
|
mid = invlist_previous_index(invlist);
|
|
assert(mid >=0);
|
|
if (UNLIKELY(mid > highest_element)) {
|
|
mid = highest_element;
|
|
}
|
|
|
|
/* <mid> contains the cache of the result of the previous call to this
|
|
* function (0 the first time). See if this call is for the same result,
|
|
* or if it is for mid-1. This is under the theory that calls to this
|
|
* function will often be for related code points that are near each other.
|
|
* And benchmarks show that caching gives better results. We also test
|
|
* here if the code point is within the bounds of the list. These tests
|
|
* replace others that would have had to be made anyway to make sure that
|
|
* the array bounds were not exceeded, and these give us extra information
|
|
* at the same time */
|
|
if (cp >= array[mid]) {
|
|
if (cp >= array[highest_element]) {
|
|
return highest_element;
|
|
}
|
|
|
|
/* Here, array[mid] <= cp < array[highest_element]. This means that
|
|
* the final element is not the answer, so can exclude it; it also
|
|
* means that <mid> is not the final element, so can refer to 'mid + 1'
|
|
* safely */
|
|
if (cp < array[mid + 1]) {
|
|
return mid;
|
|
}
|
|
high--;
|
|
low = mid + 1;
|
|
}
|
|
else { /* cp < aray[mid] */
|
|
if (cp < array[0]) { /* Fail if outside the array */
|
|
return -1;
|
|
}
|
|
high = mid;
|
|
if (cp >= array[mid - 1]) {
|
|
goto found_entry;
|
|
}
|
|
}
|
|
|
|
/* Binary search. What we are looking for is <i> such that
|
|
* array[i] <= cp < array[i+1]
|
|
* The loop below converges on the i+1. Note that there may not be an
|
|
* (i+1)th element in the array, and things work nonetheless */
|
|
while (low < high) {
|
|
mid = (low + high) / 2;
|
|
assert(mid <= highest_element);
|
|
if (array[mid] <= cp) { /* cp >= array[mid] */
|
|
low = mid + 1;
|
|
|
|
/* We could do this extra test to exit the loop early.
|
|
if (cp < array[low]) {
|
|
return mid;
|
|
}
|
|
*/
|
|
}
|
|
else { /* cp < array[mid] */
|
|
high = mid;
|
|
}
|
|
}
|
|
|
|
found_entry:
|
|
high--;
|
|
invlist_set_previous_index(invlist, high);
|
|
return high;
|
|
}
|
|
|
|
void
|
|
Perl_invlist_union_maybe_complement_2nd_(pTHX_ SV* const a, SV* const b,
|
|
const bool complement_b, SV** output)
|
|
{
|
|
/* Take the union of two inversion lists and point '*output' to it. On
|
|
* input, '*output' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
|
|
* even 'a' or 'b'). If to an inversion list, the contents of the original
|
|
* list will be replaced by the union. The first list, 'a', may be
|
|
* NULL, in which case a copy of the second list is placed in '*output'.
|
|
* If 'complement_b' is true, the union is taken of the complement
|
|
* (inversion) of 'b' instead of b itself.
|
|
*
|
|
* The basis for this comes from "Unicode Demystified" Chapter 13 by
|
|
* Richard Gillam, published by Addison-Wesley, and explained at some
|
|
* length there. The preface says to incorporate its examples into your
|
|
* code at your own risk.
|
|
*
|
|
* The algorithm is like a merge sort. */
|
|
|
|
const UV* array_a; /* a's array */
|
|
const UV* array_b;
|
|
UV len_a; /* length of a's array */
|
|
UV len_b;
|
|
|
|
SV* u; /* the resulting union */
|
|
UV* array_u;
|
|
UV len_u = 0;
|
|
|
|
UV i_a = 0; /* current index into a's array */
|
|
UV i_b = 0;
|
|
UV i_u = 0;
|
|
|
|
/* running count, as explained in the algorithm source book; items are
|
|
* stopped accumulating and are output when the count changes to/from 0.
|
|
* The count is incremented when we start a range that's in an input's set,
|
|
* and decremented when we start a range that's not in a set. So this
|
|
* variable can be 0, 1, or 2. When it is 0 neither input is in their set,
|
|
* and hence nothing goes into the union; 1, just one of the inputs is in
|
|
* its set (and its current range gets added to the union); and 2 when both
|
|
* inputs are in their sets. */
|
|
UV count = 0;
|
|
|
|
PERL_ARGS_ASSERT_INVLIST_UNION_MAYBE_COMPLEMENT_2ND_;
|
|
assert(a != b);
|
|
assert(*output == NULL || is_invlist(*output));
|
|
|
|
len_b = invlist_len_(b);
|
|
if (len_b == 0) {
|
|
|
|
/* Here, 'b' is empty, hence it's complement is all possible code
|
|
* points. So if the union includes the complement of 'b', it includes
|
|
* everything, and we need not even look at 'a'. It's easiest to
|
|
* create a new inversion list that matches everything. */
|
|
if (complement_b) {
|
|
SV* everything = add_range_to_invlist_(NULL, 0, UV_MAX);
|
|
|
|
if (*output == NULL) { /* If the output didn't exist, just point it
|
|
at the new list */
|
|
*output = everything;
|
|
}
|
|
else { /* Otherwise, replace its contents with the new list */
|
|
invlist_replace_list_destroys_src(*output, everything);
|
|
SvREFCNT_dec_NN(everything);
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
/* Here, we don't want the complement of 'b', and since 'b' is empty,
|
|
* the union will come entirely from 'a'. If 'a' is NULL or empty, the
|
|
* output will be empty */
|
|
|
|
if (a == NULL || invlist_len_(a) == 0) {
|
|
if (*output == NULL) {
|
|
*output = new_invlist_(0);
|
|
}
|
|
else {
|
|
invlist_clear(*output);
|
|
}
|
|
return;
|
|
}
|
|
|
|
/* Here, 'a' is not empty, but 'b' is, so 'a' entirely determines the
|
|
* union. We can just return a copy of 'a' if '*output' doesn't point
|
|
* to an existing list */
|
|
if (*output == NULL) {
|
|
*output = invlist_clone(a, NULL);
|
|
return;
|
|
}
|
|
|
|
/* If the output is to overwrite 'a', we have a no-op, as it's
|
|
* already in 'a' */
|
|
if (*output == a) {
|
|
return;
|
|
}
|
|
|
|
/* Here, '*output' is to be overwritten by 'a' */
|
|
u = invlist_clone(a, NULL);
|
|
invlist_replace_list_destroys_src(*output, u);
|
|
SvREFCNT_dec_NN(u);
|
|
|
|
return;
|
|
}
|
|
|
|
/* Here 'b' is not empty. See about 'a' */
|
|
|
|
if (a == NULL || ((len_a = invlist_len_(a)) == 0)) {
|
|
|
|
/* Here, 'a' is empty (and b is not). That means the union will come
|
|
* entirely from 'b'. If '*output' is NULL, we can directly return a
|
|
* clone of 'b'. Otherwise, we replace the contents of '*output' with
|
|
* the clone */
|
|
|
|
SV ** dest = (*output == NULL) ? output : &u;
|
|
*dest = invlist_clone(b, NULL);
|
|
if (complement_b) {
|
|
invlist_invert_(*dest);
|
|
}
|
|
|
|
if (dest == &u) {
|
|
invlist_replace_list_destroys_src(*output, u);
|
|
SvREFCNT_dec_NN(u);
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
/* Here both lists exist and are non-empty */
|
|
array_a = invlist_array(a);
|
|
array_b = invlist_array(b);
|
|
|
|
/* If are to take the union of 'a' with the complement of b, set it
|
|
* up so are looking at b's complement. */
|
|
if (complement_b) {
|
|
|
|
/* To complement, we invert: if the first element is 0, remove it. To
|
|
* do this, we just pretend the array starts one later */
|
|
if (array_b[0] == 0) {
|
|
array_b++;
|
|
len_b--;
|
|
}
|
|
else {
|
|
|
|
/* But if the first element is not zero, we pretend the list starts
|
|
* at the 0 that is always stored immediately before the array. */
|
|
array_b--;
|
|
len_b++;
|
|
}
|
|
}
|
|
|
|
/* Size the union for the worst case: that the sets are completely
|
|
* disjoint */
|
|
u = new_invlist_(len_a + len_b);
|
|
|
|
/* Will contain U+0000 if either component does */
|
|
array_u = invlist_array_init_(u, ( len_a > 0 && array_a[0] == 0)
|
|
|| (len_b > 0 && array_b[0] == 0));
|
|
|
|
/* Go through each input list item by item, stopping when have exhausted
|
|
* one of them */
|
|
while (i_a < len_a && i_b < len_b) {
|
|
UV cp; /* The element to potentially add to the union's array */
|
|
bool cp_in_set; /* is it in the input list's set or not */
|
|
|
|
/* We need to take one or the other of the two inputs for the union.
|
|
* Since we are merging two sorted lists, we take the smaller of the
|
|
* next items. In case of a tie, we take first the one that is in its
|
|
* set. If we first took the one not in its set, it would decrement
|
|
* the count, possibly to 0 which would cause it to be output as ending
|
|
* the range, and the next time through we would take the same number,
|
|
* and output it again as beginning the next range. By doing it the
|
|
* opposite way, there is no possibility that the count will be
|
|
* momentarily decremented to 0, and thus the two adjoining ranges will
|
|
* be seamlessly merged. (In a tie and both are in the set or both not
|
|
* in the set, it doesn't matter which we take first.) */
|
|
if ( array_a[i_a] < array_b[i_b]
|
|
|| ( array_a[i_a] == array_b[i_b]
|
|
&& ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
|
|
{
|
|
cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
|
|
cp = array_a[i_a++];
|
|
}
|
|
else {
|
|
cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
|
|
cp = array_b[i_b++];
|
|
}
|
|
|
|
/* Here, have chosen which of the two inputs to look at. Only output
|
|
* if the running count changes to/from 0, which marks the
|
|
* beginning/end of a range that's in the set */
|
|
if (cp_in_set) {
|
|
if (count == 0) {
|
|
array_u[i_u++] = cp;
|
|
}
|
|
count++;
|
|
}
|
|
else {
|
|
count--;
|
|
if (count == 0) {
|
|
array_u[i_u++] = cp;
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
/* The loop above increments the index into exactly one of the input lists
|
|
* each iteration, and ends when either index gets to its list end. That
|
|
* means the other index is lower than its end, and so something is
|
|
* remaining in that one. We decrement 'count', as explained below, if
|
|
* that list is in its set. (i_a and i_b each currently index the element
|
|
* beyond the one we care about.) */
|
|
if ( (i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
|
|
|| (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
|
|
{
|
|
count--;
|
|
}
|
|
|
|
/* Above we decremented 'count' if the list that had unexamined elements in
|
|
* it was in its set. This has made it so that 'count' being non-zero
|
|
* means there isn't anything left to output; and 'count' equal to 0 means
|
|
* that what is left to output is precisely that which is left in the
|
|
* non-exhausted input list.
|
|
*
|
|
* To see why, note first that the exhausted input obviously has nothing
|
|
* left to add to the union. If it was in its set at its end, that means
|
|
* the set extends from here to the platform's infinity, and hence so does
|
|
* the union and the non-exhausted set is irrelevant. The exhausted set
|
|
* also contributed 1 to 'count'. If 'count' was 2, it got decremented to
|
|
* 1, but if it was 1, the non-exhausted set wasn't in its set, and so
|
|
* 'count' remains at 1. This is consistent with the decremented 'count'
|
|
* != 0 meaning there's nothing left to add to the union.
|
|
*
|
|
* But if the exhausted input wasn't in its set, it contributed 0 to
|
|
* 'count', and the rest of the union will be whatever the other input is.
|
|
* If 'count' was 0, neither list was in its set, and 'count' remains 0;
|
|
* otherwise it gets decremented to 0. This is consistent with 'count'
|
|
* == 0 meaning the remainder of the union is whatever is left in the
|
|
* non-exhausted list. */
|
|
if (count != 0) {
|
|
len_u = i_u;
|
|
}
|
|
else {
|
|
IV copy_count = len_a - i_a;
|
|
if (copy_count > 0) { /* The non-exhausted input is 'a' */
|
|
Copy(array_a + i_a, array_u + i_u, copy_count, UV);
|
|
}
|
|
else { /* The non-exhausted input is b */
|
|
copy_count = len_b - i_b;
|
|
Copy(array_b + i_b, array_u + i_u, copy_count, UV);
|
|
}
|
|
len_u = i_u + copy_count;
|
|
}
|
|
|
|
/* Set the result to the final length, which can change the pointer to
|
|
* array_u, so re-find it. (Note that it is unlikely that this will
|
|
* change, as we are shrinking the space, not enlarging it) */
|
|
if (len_u != invlist_len_(u)) {
|
|
invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
|
|
invlist_trim(u);
|
|
array_u = invlist_array(u);
|
|
}
|
|
|
|
if (*output == NULL) { /* Simply return the new inversion list */
|
|
*output = u;
|
|
}
|
|
else {
|
|
/* Otherwise, overwrite the inversion list that was in '*output'. We
|
|
* could instead free '*output', and then set it to 'u', but experience
|
|
* has shown [perl #127392] that if the input is a mortal, we can get a
|
|
* huge build-up of these during regex compilation before they get
|
|
* freed. */
|
|
invlist_replace_list_destroys_src(*output, u);
|
|
SvREFCNT_dec_NN(u);
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
void
|
|
Perl_invlist_intersection_maybe_complement_2nd_(pTHX_ SV* const a, SV* const b,
|
|
const bool complement_b, SV** i)
|
|
{
|
|
/* Take the intersection of two inversion lists and point '*i' to it. On
|
|
* input, '*i' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
|
|
* even 'a' or 'b'). If to an inversion list, the contents of the original
|
|
* list will be replaced by the intersection. The first list, 'a', may be
|
|
* NULL, in which case '*i' will be an empty list. If 'complement_b' is
|
|
* true, the result will be the intersection of 'a' and the complement (or
|
|
* inversion) of 'b' instead of 'b' directly.
|
|
*
|
|
* The basis for this comes from "Unicode Demystified" Chapter 13 by
|
|
* Richard Gillam, published by Addison-Wesley, and explained at some
|
|
* length there. The preface says to incorporate its examples into your
|
|
* code at your own risk. In fact, it had bugs
|
|
*
|
|
* The algorithm is like a merge sort, and is essentially the same as the
|
|
* union above
|
|
*/
|
|
|
|
const UV* array_a; /* a's array */
|
|
const UV* array_b;
|
|
UV len_a; /* length of a's array */
|
|
UV len_b;
|
|
|
|
SV* r; /* the resulting intersection */
|
|
UV* array_r;
|
|
UV len_r = 0;
|
|
|
|
UV i_a = 0; /* current index into a's array */
|
|
UV i_b = 0;
|
|
UV i_r = 0;
|
|
|
|
/* running count of how many of the two inputs are postitioned at ranges
|
|
* that are in their sets. As explained in the algorithm source book,
|
|
* items are stopped accumulating and are output when the count changes
|
|
* to/from 2. The count is incremented when we start a range that's in an
|
|
* input's set, and decremented when we start a range that's not in a set.
|
|
* Only when it is 2 are we in the intersection. */
|
|
UV count = 0;
|
|
|
|
PERL_ARGS_ASSERT_INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND_;
|
|
assert(a != b);
|
|
assert(*i == NULL || is_invlist(*i));
|
|
|
|
/* Special case if either one is empty */
|
|
len_a = (a == NULL) ? 0 : invlist_len_(a);
|
|
if ((len_a == 0) || ((len_b = invlist_len_(b)) == 0)) {
|
|
if (len_a != 0 && complement_b) {
|
|
|
|
/* Here, 'a' is not empty, therefore from the enclosing 'if', 'b'
|
|
* must be empty. Here, also we are using 'b's complement, which
|
|
* hence must be every possible code point. Thus the intersection
|
|
* is simply 'a'. */
|
|
|
|
if (*i == a) { /* No-op */
|
|
return;
|
|
}
|
|
|
|
if (*i == NULL) {
|
|
*i = invlist_clone(a, NULL);
|
|
return;
|
|
}
|
|
|
|
r = invlist_clone(a, NULL);
|
|
invlist_replace_list_destroys_src(*i, r);
|
|
SvREFCNT_dec_NN(r);
|
|
return;
|
|
}
|
|
|
|
/* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
|
|
* intersection must be empty */
|
|
if (*i == NULL) {
|
|
*i = new_invlist_(0);
|
|
return;
|
|
}
|
|
|
|
invlist_clear(*i);
|
|
return;
|
|
}
|
|
|
|
/* Here both lists exist and are non-empty */
|
|
array_a = invlist_array(a);
|
|
array_b = invlist_array(b);
|
|
|
|
/* If are to take the intersection of 'a' with the complement of b, set it
|
|
* up so are looking at b's complement. */
|
|
if (complement_b) {
|
|
|
|
/* To complement, we invert: if the first element is 0, remove it. To
|
|
* do this, we just pretend the array starts one later */
|
|
if (array_b[0] == 0) {
|
|
array_b++;
|
|
len_b--;
|
|
}
|
|
else {
|
|
|
|
/* But if the first element is not zero, we pretend the list starts
|
|
* at the 0 that is always stored immediately before the array. */
|
|
array_b--;
|
|
len_b++;
|
|
}
|
|
}
|
|
|
|
/* Size the intersection for the worst case: that the intersection ends up
|
|
* fragmenting everything to be completely disjoint */
|
|
r = new_invlist_(len_a + len_b);
|
|
|
|
/* Will contain U+0000 iff both components do */
|
|
array_r = invlist_array_init_(r, len_a > 0 && array_a[0] == 0
|
|
&& len_b > 0 && array_b[0] == 0);
|
|
|
|
/* Go through each list item by item, stopping when have exhausted one of
|
|
* them */
|
|
while (i_a < len_a && i_b < len_b) {
|
|
UV cp; /* The element to potentially add to the intersection's
|
|
array */
|
|
bool cp_in_set; /* Is it in the input list's set or not */
|
|
|
|
/* We need to take one or the other of the two inputs for the
|
|
* intersection. Since we are merging two sorted lists, we take the
|
|
* smaller of the next items. In case of a tie, we take first the one
|
|
* that is not in its set (a difference from the union algorithm). If
|
|
* we first took the one in its set, it would increment the count,
|
|
* possibly to 2 which would cause it to be output as starting a range
|
|
* in the intersection, and the next time through we would take that
|
|
* same number, and output it again as ending the set. By doing the
|
|
* opposite of this, there is no possibility that the count will be
|
|
* momentarily incremented to 2. (In a tie and both are in the set or
|
|
* both not in the set, it doesn't matter which we take first.) */
|
|
if ( array_a[i_a] < array_b[i_b]
|
|
|| ( array_a[i_a] == array_b[i_b]
|
|
&& ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
|
|
{
|
|
cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
|
|
cp = array_a[i_a++];
|
|
}
|
|
else {
|
|
cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
|
|
cp = array_b[i_b++];
|
|
}
|
|
|
|
/* Here, have chosen which of the two inputs to look at. Only output
|
|
* if the running count changes to/from 2, which marks the
|
|
* beginning/end of a range that's in the intersection */
|
|
if (cp_in_set) {
|
|
count++;
|
|
if (count == 2) {
|
|
array_r[i_r++] = cp;
|
|
}
|
|
}
|
|
else {
|
|
if (count == 2) {
|
|
array_r[i_r++] = cp;
|
|
}
|
|
count--;
|
|
}
|
|
|
|
}
|
|
|
|
/* The loop above increments the index into exactly one of the input lists
|
|
* each iteration, and ends when either index gets to its list end. That
|
|
* means the other index is lower than its end, and so something is
|
|
* remaining in that one. We increment 'count', as explained below, if the
|
|
* exhausted list was in its set. (i_a and i_b each currently index the
|
|
* element beyond the one we care about.) */
|
|
if ( (i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
|
|
|| (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
|
|
{
|
|
count++;
|
|
}
|
|
|
|
/* Above we incremented 'count' if the exhausted list was in its set. This
|
|
* has made it so that 'count' being below 2 means there is nothing left to
|
|
* output; otheriwse what's left to add to the intersection is precisely
|
|
* that which is left in the non-exhausted input list.
|
|
*
|
|
* To see why, note first that the exhausted input obviously has nothing
|
|
* left to affect the intersection. If it was in its set at its end, that
|
|
* means the set extends from here to the platform's infinity, and hence
|
|
* anything in the non-exhausted's list will be in the intersection, and
|
|
* anything not in it won't be. Hence, the rest of the intersection is
|
|
* precisely what's in the non-exhausted list The exhausted set also
|
|
* contributed 1 to 'count', meaning 'count' was at least 1. Incrementing
|
|
* it means 'count' is now at least 2. This is consistent with the
|
|
* incremented 'count' being >= 2 means to add the non-exhausted list to
|
|
* the intersection.
|
|
*
|
|
* But if the exhausted input wasn't in its set, it contributed 0 to
|
|
* 'count', and the intersection can't include anything further; the
|
|
* non-exhausted set is irrelevant. 'count' was at most 1, and doesn't get
|
|
* incremented. This is consistent with 'count' being < 2 meaning nothing
|
|
* further to add to the intersection. */
|
|
if (count < 2) { /* Nothing left to put in the intersection. */
|
|
len_r = i_r;
|
|
}
|
|
else { /* copy the non-exhausted list, unchanged. */
|
|
IV copy_count = len_a - i_a;
|
|
if (copy_count > 0) { /* a is the one with stuff left */
|
|
Copy(array_a + i_a, array_r + i_r, copy_count, UV);
|
|
}
|
|
else { /* b is the one with stuff left */
|
|
copy_count = len_b - i_b;
|
|
Copy(array_b + i_b, array_r + i_r, copy_count, UV);
|
|
}
|
|
len_r = i_r + copy_count;
|
|
}
|
|
|
|
/* Set the result to the final length, which can change the pointer to
|
|
* array_r, so re-find it. (Note that it is unlikely that this will
|
|
* change, as we are shrinking the space, not enlarging it) */
|
|
if (len_r != invlist_len_(r)) {
|
|
invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
|
|
invlist_trim(r);
|
|
array_r = invlist_array(r);
|
|
}
|
|
|
|
if (*i == NULL) { /* Simply return the calculated intersection */
|
|
*i = r;
|
|
}
|
|
else { /* Otherwise, replace the existing inversion list in '*i'. We could
|
|
instead free '*i', and then set it to 'r', but experience has
|
|
shown [perl #127392] that if the input is a mortal, we can get a
|
|
huge build-up of these during regex compilation before they get
|
|
freed. */
|
|
if (len_r) {
|
|
invlist_replace_list_destroys_src(*i, r);
|
|
}
|
|
else {
|
|
invlist_clear(*i);
|
|
}
|
|
SvREFCNT_dec_NN(r);
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
SV*
|
|
Perl_add_range_to_invlist_(pTHX_ SV* invlist, UV start, UV end)
|
|
{
|
|
/* Add the range from 'start' to 'end' inclusive to the inversion list's
|
|
* set. A pointer to the inversion list is returned. This may actually be
|
|
* a new list, in which case the passed in one has been destroyed. The
|
|
* passed-in inversion list can be NULL, in which case a new one is created
|
|
* with just the one range in it. The new list is not necessarily
|
|
* NUL-terminated. Space is not freed if the inversion list shrinks as a
|
|
* result of this function. The gain would not be large, and in many
|
|
* cases, this is called multiple times on a single inversion list, so
|
|
* anything freed may almost immediately be needed again.
|
|
*
|
|
* This used to mostly call the 'union' routine, but that is much more
|
|
* heavyweight than really needed for a single range addition */
|
|
|
|
UV* array; /* The array implementing the inversion list */
|
|
UV len; /* How many elements in 'array' */
|
|
SSize_t i_s; /* index into the invlist array where 'start'
|
|
should go */
|
|
SSize_t i_e = 0; /* And the index where 'end' should go */
|
|
UV cur_highest; /* The highest code point in the inversion list
|
|
upon entry to this function */
|
|
|
|
/* This range becomes the whole inversion list if none already existed */
|
|
if (invlist == NULL) {
|
|
invlist = new_invlist_(2);
|
|
append_range_to_invlist_(invlist, start, end);
|
|
return invlist;
|
|
}
|
|
|
|
/* Likewise, if the inversion list is currently empty */
|
|
len = invlist_len_(invlist);
|
|
if (len == 0) {
|
|
append_range_to_invlist_(invlist, start, end);
|
|
return invlist;
|
|
}
|
|
|
|
/* Starting here, we have to know the internals of the list */
|
|
array = invlist_array(invlist);
|
|
|
|
/* If the new range ends higher than the current highest ... */
|
|
cur_highest = invlist_highest(invlist);
|
|
if (end > cur_highest) {
|
|
|
|
/* If the whole range is higher, we can just append it */
|
|
if (start > cur_highest) {
|
|
append_range_to_invlist_(invlist, start, end);
|
|
return invlist;
|
|
}
|
|
|
|
/* Otherwise, add the portion that is higher ... */
|
|
append_range_to_invlist_(invlist, cur_highest + 1, end);
|
|
|
|
/* ... and continue on below to handle the rest. As a result of the
|
|
* above append, we know that the index of the end of the range is the
|
|
* final even numbered one of the array. Recall that the final element
|
|
* always starts a range that extends to infinity. If that range is in
|
|
* the set (meaning the set goes from here to infinity), it will be an
|
|
* even index, but if it isn't in the set, it's odd, and the final
|
|
* range in the set is one less, which is even. */
|
|
if (end == UV_MAX) {
|
|
i_e = len;
|
|
}
|
|
else {
|
|
i_e = len - 2;
|
|
}
|
|
}
|
|
|
|
/* We have dealt with appending, now see about prepending. If the new
|
|
* range starts lower than the current lowest ... */
|
|
if (start < array[0]) {
|
|
|
|
/* Adding something which has 0 in it is somewhat tricky, and uncommon.
|
|
* Let the union code handle it, rather than having to know the
|
|
* trickiness in two code places. */
|
|
if (UNLIKELY(start == 0)) {
|
|
SV* range_invlist;
|
|
|
|
range_invlist = new_invlist_(2);
|
|
append_range_to_invlist_(range_invlist, start, end);
|
|
|
|
invlist_union_(invlist, range_invlist, &invlist);
|
|
|
|
SvREFCNT_dec_NN(range_invlist);
|
|
|
|
return invlist;
|
|
}
|
|
|
|
/* If the whole new range comes before the first entry, and doesn't
|
|
* extend it, we have to insert it as an additional range */
|
|
if (end < array[0] - 1) {
|
|
i_s = i_e = -1;
|
|
goto splice_in_new_range;
|
|
}
|
|
|
|
/* Here the new range adjoins the existing first range, extending it
|
|
* downwards. */
|
|
array[0] = start;
|
|
|
|
/* And continue on below to handle the rest. We know that the index of
|
|
* the beginning of the range is the first one of the array */
|
|
i_s = 0;
|
|
}
|
|
else { /* Not prepending any part of the new range to the existing list.
|
|
* Find where in the list it should go. This finds i_s, such that:
|
|
* invlist[i_s] <= start < array[i_s+1]
|
|
*/
|
|
i_s = invlist_search_(invlist, start);
|
|
}
|
|
|
|
/* At this point, any extending before the beginning of the inversion list
|
|
* and/or after the end has been done. This has made it so that, in the
|
|
* code below, each endpoint of the new range is either in a range that is
|
|
* in the set, or is in a gap between two ranges that are. This means we
|
|
* don't have to worry about exceeding the array bounds.
|
|
*
|
|
* Find where in the list the new range ends (but we can skip this if we
|
|
* have already determined what it is, or if it will be the same as i_s,
|
|
* which we already have computed) */
|
|
if (i_e == 0) {
|
|
i_e = (start == end)
|
|
? i_s
|
|
: invlist_search_(invlist, end);
|
|
}
|
|
|
|
/* Here generally invlist[i_e] <= end < array[i_e+1]. But if invlist[i_e]
|
|
* is a range that goes to infinity there is no element at invlist[i_e+1],
|
|
* so only the first relation holds. */
|
|
|
|
if ( ! ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
|
|
|
|
/* Here, the ranges on either side of the beginning of the new range
|
|
* are in the set, and this range starts in the gap between them.
|
|
*
|
|
* The new range extends the range above it downwards if the new range
|
|
* ends at or above that range's start */
|
|
const bool extends_the_range_above = ( end == UV_MAX
|
|
|| end + 1 >= array[i_s+1]);
|
|
|
|
/* The new range extends the range below it upwards if it begins just
|
|
* after where that range ends */
|
|
if (start == array[i_s]) {
|
|
|
|
/* If the new range fills the entire gap between the other ranges,
|
|
* they will get merged together. Other ranges may also get
|
|
* merged, depending on how many of them the new range spans. In
|
|
* the general case, we do the merge later, just once, after we
|
|
* figure out how many to merge. But in the case where the new
|
|
* range exactly spans just this one gap (possibly extending into
|
|
* the one above), we do the merge here, and an early exit. This
|
|
* is done here to avoid having to special case later. */
|
|
if (i_e - i_s <= 1) {
|
|
|
|
/* If i_e - i_s == 1, it means that the new range terminates
|
|
* within the range above, and hence 'extends_the_range_above'
|
|
* must be true. (If the range above it extends to infinity,
|
|
* 'i_s+2' will be above the array's limit, but 'len-i_s-2'
|
|
* will be 0, so no harm done.) */
|
|
if (extends_the_range_above) {
|
|
Move(array + i_s + 2, array + i_s, len - i_s - 2, UV);
|
|
invlist_set_len(invlist,
|
|
len - 2,
|
|
*(get_invlist_offset_addr(invlist)));
|
|
return invlist;
|
|
}
|
|
|
|
/* Here, i_e must == i_s. We keep them in sync, as they apply
|
|
* to the same range, and below we are about to decrement i_s
|
|
* */
|
|
i_e--;
|
|
}
|
|
|
|
/* Here, the new range is adjacent to the one below. (It may also
|
|
* span beyond the range above, but that will get resolved later.)
|
|
* Extend the range below to include this one. */
|
|
array[i_s] = (end == UV_MAX) ? UV_MAX : end + 1;
|
|
i_s--;
|
|
start = array[i_s];
|
|
}
|
|
else if (extends_the_range_above) {
|
|
|
|
/* Here the new range only extends the range above it, but not the
|
|
* one below. It merges with the one above. Again, we keep i_e
|
|
* and i_s in sync if they point to the same range */
|
|
if (i_e == i_s) {
|
|
i_e++;
|
|
}
|
|
i_s++;
|
|
array[i_s] = start;
|
|
}
|
|
}
|
|
|
|
/* Here, we've dealt with the new range start extending any adjoining
|
|
* existing ranges.
|
|
*
|
|
* If the new range extends to infinity, it is now the final one,
|
|
* regardless of what was there before */
|
|
if (UNLIKELY(end == UV_MAX)) {
|
|
invlist_set_len(invlist, i_s + 1, *(get_invlist_offset_addr(invlist)));
|
|
return invlist;
|
|
}
|
|
|
|
/* If i_e started as == i_s, it has also been dealt with,
|
|
* and been updated to the new i_s, which will fail the following if */
|
|
if (! ELEMENT_RANGE_MATCHES_INVLIST(i_e)) {
|
|
|
|
/* Here, the ranges on either side of the end of the new range are in
|
|
* the set, and this range ends in the gap between them.
|
|
*
|
|
* If this range is adjacent to (hence extends) the range above it, it
|
|
* becomes part of that range; likewise if it extends the range below,
|
|
* it becomes part of that range */
|
|
if (end + 1 == array[i_e+1]) {
|
|
i_e++;
|
|
array[i_e] = start;
|
|
}
|
|
else if (start <= array[i_e]) {
|
|
array[i_e] = end + 1;
|
|
i_e--;
|
|
}
|
|
}
|
|
|
|
if (i_s == i_e) {
|
|
|
|
/* If the range fits entirely in an existing range (as possibly already
|
|
* extended above), it doesn't add anything new */
|
|
if (ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
|
|
return invlist;
|
|
}
|
|
|
|
/* Here, no part of the range is in the list. Must add it. It will
|
|
* occupy 2 more slots */
|
|
splice_in_new_range:
|
|
|
|
invlist_extend(invlist, len + 2);
|
|
array = invlist_array(invlist);
|
|
/* Move the rest of the array down two slots. Don't include any
|
|
* trailing NUL */
|
|
Move(array + i_e + 1, array + i_e + 3, len - i_e - 1, UV);
|
|
|
|
/* Do the actual splice */
|
|
array[i_e+1] = start;
|
|
array[i_e+2] = end + 1;
|
|
invlist_set_len(invlist, len + 2, *(get_invlist_offset_addr(invlist)));
|
|
return invlist;
|
|
}
|
|
|
|
/* Here the new range crossed the boundaries of a pre-existing range. The
|
|
* code above has adjusted things so that both ends are in ranges that are
|
|
* in the set. This means everything in between must also be in the set.
|
|
* Just squash things together */
|
|
Move(array + i_e + 1, array + i_s + 1, len - i_e - 1, UV);
|
|
invlist_set_len(invlist,
|
|
len - i_e + i_s,
|
|
*(get_invlist_offset_addr(invlist)));
|
|
|
|
return invlist;
|
|
}
|
|
|
|
SV*
|
|
Perl_setup_canned_invlist_(pTHX_ const STRLEN size, const UV element0,
|
|
UV** other_elements_ptr)
|
|
{
|
|
/* Create and return an inversion list whose contents are to be populated
|
|
* by the caller. The caller gives the number of elements (in 'size') and
|
|
* the very first element ('element0'). This function will set
|
|
* '*other_elements_ptr' to an array of UVs, where the remaining elements
|
|
* are to be placed.
|
|
*
|
|
* Obviously there is some trust involved that the caller will properly
|
|
* fill in the other elements of the array.
|
|
*
|
|
* (The first element needs to be passed in, as the underlying code does
|
|
* things differently depending on whether it is zero or non-zero) */
|
|
|
|
SV* invlist = new_invlist_(size);
|
|
bool offset;
|
|
|
|
PERL_ARGS_ASSERT_SETUP_CANNED_INVLIST_;
|
|
|
|
invlist = add_cp_to_invlist(invlist, element0);
|
|
offset = *get_invlist_offset_addr(invlist);
|
|
|
|
invlist_set_len(invlist, size, offset);
|
|
*other_elements_ptr = invlist_array(invlist) + 1;
|
|
return invlist;
|
|
}
|
|
|
|
#endif
|
|
|
|
#ifndef PERL_IN_XSUB_RE
|
|
void
|
|
Perl_invlist_invert_(pTHX_ SV* const invlist)
|
|
{
|
|
/* Complement the input inversion list. This adds a 0 if the list didn't
|
|
* have a zero; removes it otherwise. As described above, the data
|
|
* structure is set up so that this is very efficient */
|
|
|
|
PERL_ARGS_ASSERT_INVLIST_INVERT_;
|
|
|
|
assert(! invlist_is_iterating(invlist));
|
|
|
|
/* The inverse of matching nothing is matching everything */
|
|
if (invlist_len_(invlist) == 0) {
|
|
append_range_to_invlist_(invlist, 0, UV_MAX);
|
|
return;
|
|
}
|
|
|
|
*get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
|
|
}
|
|
|
|
SV*
|
|
Perl_invlist_clone(pTHX_ SV* const invlist, SV* new_invlist)
|
|
{
|
|
/* Return a new inversion list that is a copy of the input one, which is
|
|
* unchanged. The new list will not be mortal even if the old one was. */
|
|
|
|
const STRLEN nominal_length = invlist_len_(invlist);
|
|
const STRLEN physical_length = SvCUR(invlist);
|
|
const bool offset = *(get_invlist_offset_addr(invlist));
|
|
|
|
PERL_ARGS_ASSERT_INVLIST_CLONE;
|
|
|
|
if (new_invlist == NULL) {
|
|
new_invlist = new_invlist_(nominal_length);
|
|
}
|
|
else {
|
|
sv_upgrade(new_invlist, SVt_INVLIST);
|
|
initialize_invlist_guts(new_invlist, nominal_length);
|
|
}
|
|
|
|
*(get_invlist_offset_addr(new_invlist)) = offset;
|
|
invlist_set_len(new_invlist, nominal_length, offset);
|
|
Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
|
|
|
|
return new_invlist;
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
#ifndef PERL_IN_XSUB_RE
|
|
void
|
|
Perl_invlist_dump_(pTHX_ PerlIO *file, I32 level,
|
|
const char * const indent, SV* const invlist)
|
|
{
|
|
/* Designed to be called only by do_sv_dump(). Dumps out the ranges of the
|
|
* inversion list 'invlist' to 'file' at 'level' Each line is prefixed by
|
|
* the string 'indent'. The output looks like this:
|
|
[0] 0x000A .. 0x000D
|
|
[2] 0x0085
|
|
[4] 0x2028 .. 0x2029
|
|
[6] 0x3104 .. INFTY
|
|
* This means that the first range of code points matched by the list are
|
|
* 0xA through 0xD; the second range contains only the single code point
|
|
* 0x85, etc. An inversion list is an array of UVs. Two array elements
|
|
* are used to define each range (except if the final range extends to
|
|
* infinity, only a single element is needed). The array index of the
|
|
* first element for the corresponding range is given in brackets. */
|
|
|
|
UV start, end;
|
|
STRLEN count = 0;
|
|
|
|
PERL_ARGS_ASSERT_INVLIST_DUMP_;
|
|
|
|
if (invlist_is_iterating(invlist)) {
|
|
Perl_dump_indent(aTHX_ level, file,
|
|
"%sCan't dump inversion list because is in middle of iterating\n",
|
|
indent);
|
|
return;
|
|
}
|
|
|
|
invlist_iterinit(invlist);
|
|
while (invlist_iternext(invlist, &start, &end)) {
|
|
if (end == UV_MAX) {
|
|
Perl_dump_indent(aTHX_ level, file,
|
|
"%s[%" UVuf "] 0x%04" UVXf " .. INFTY\n",
|
|
indent, (UV)count, start);
|
|
}
|
|
else if (end != start) {
|
|
Perl_dump_indent(aTHX_ level, file,
|
|
"%s[%" UVuf "] 0x%04" UVXf " .. 0x%04" UVXf "\n",
|
|
indent, (UV)count, start, end);
|
|
}
|
|
else {
|
|
Perl_dump_indent(aTHX_ level, file, "%s[%" UVuf "] 0x%04" UVXf "\n",
|
|
indent, (UV)count, start);
|
|
}
|
|
count += 2;
|
|
}
|
|
}
|
|
|
|
#endif
|
|
|
|
#if defined(PERL_ARGS_ASSERT_INVLISTEQ_) && !defined(PERL_IN_XSUB_RE)
|
|
bool
|
|
Perl_invlistEQ_(pTHX_ SV* const a, SV* const b, const bool complement_b)
|
|
{
|
|
/* Return a boolean as to if the two passed in inversion lists are
|
|
* identical. The final argument, if true, says to take the complement of
|
|
* the second inversion list before doing the comparison */
|
|
|
|
const UV len_a = invlist_len_(a);
|
|
UV len_b = invlist_len_(b);
|
|
|
|
const UV* array_a = NULL;
|
|
const UV* array_b = NULL;
|
|
|
|
PERL_ARGS_ASSERT_INVLISTEQ_;
|
|
|
|
/* This code avoids accessing the arrays unless it knows the length is
|
|
* non-zero */
|
|
|
|
if (len_a == 0) {
|
|
if (len_b == 0) {
|
|
return ! complement_b;
|
|
}
|
|
}
|
|
else {
|
|
array_a = invlist_array(a);
|
|
}
|
|
|
|
if (len_b != 0) {
|
|
array_b = invlist_array(b);
|
|
}
|
|
|
|
/* If are to compare 'a' with the complement of b, set it
|
|
* up so are looking at b's complement. */
|
|
if (complement_b) {
|
|
|
|
/* The complement of nothing is everything, so <a> would have to have
|
|
* just one element, starting at zero (ending at infinity) */
|
|
if (len_b == 0) {
|
|
return (len_a == 1 && array_a[0] == 0);
|
|
}
|
|
if (array_b[0] == 0) {
|
|
|
|
/* Otherwise, to complement, we invert. Here, the first element is
|
|
* 0, just remove it. To do this, we just pretend the array starts
|
|
* one later */
|
|
|
|
array_b++;
|
|
len_b--;
|
|
}
|
|
else {
|
|
|
|
/* But if the first element is not zero, we pretend the list starts
|
|
* at the 0 that is always stored immediately before the array. */
|
|
array_b--;
|
|
len_b++;
|
|
}
|
|
}
|
|
|
|
return len_a == len_b
|
|
&& memEQ(array_a, array_b, len_a * sizeof(array_a[0]));
|
|
|
|
}
|
|
#endif
|
|
|
|
#undef HEADER_LENGTH
|
|
#undef TO_INTERNAL_SIZE
|
|
#undef FROM_INTERNAL_SIZE
|
|
#undef INVLIST_VERSION_ID
|
|
|
|
/* End of inversion list object */
|