[ruby/strscan] Add methods for peeking and reading bytes as integers
(https://github.com/ruby/strscan/pull/89)

This commit adds `scan_byte` and `peek_byte`. `scan_byte` will scan the
current byte, return it as an integer, and advance the cursor.
`peek_byte` will return the current byte as an integer without advancing
the cursor.

Currently `StringScanner#get_byte` returns a string, but I want to get
the current byte without allocating a string. I think this will help
when writing high-performance lexers.

---------

https://github.com/ruby/strscan/commit/873aba2e5d

Co-authored-by: Sutou Kouhei <kou@clear-code.com>
This commit is contained in:
Aaron Patterson 2024-02-25 16:45:08 -08:00 committed by Hiroshi SHIBATA
parent 7176c186d0
commit 164e464b04
2 changed files with 78 additions and 0 deletions

View File

@ -902,6 +902,57 @@ strscan_getch(VALUE self)
adjust_register_position(p, p->regs.end[0]));
}
/*
 * Scans one byte at the scan pointer and returns it as an Integer,
 * then advances the scan pointer by one byte.
 * Returns +nil+ if the scanner is already at the end of the string.
 * This method is not multibyte character sensitive.
 * See also: #get_byte, #peek_byte.
 *
 *   s = StringScanner.new('ab')
 *   s.scan_byte         # => 97
 *   s.scan_byte         # => 98
 *   s.scan_byte         # => nil
 *
 *   s = StringScanner.new("\244\242".force_encoding("euc-jp"))
 *   s.scan_byte         # => 164 (0xA4)
 *   s.scan_byte         # => 162 (0xA2)
 *   s.scan_byte         # => nil
 */
static VALUE
strscan_scan_byte(VALUE self)
{
    struct strscanner *p;
    VALUE byte;  /* declared up front: no declarations after statements (C90) */

    GET_SCANNER(self, p);
    CLEAR_MATCH_STATUS(p);
    if (EOS_P(p))
        return Qnil;

    /* Go through unsigned char so bytes >= 0x80 are not sign-extended. */
    byte = INT2FIX((unsigned char)*CURPTR(p));

    /* Record a one-byte match: remember the old position, then step over it. */
    p->prev = p->curr;
    p->curr++;
    MATCHED(p);
    adjust_registers_to_matched(p);
    return byte;
}
/*
 * Returns the byte at the scan pointer as an Integer without
 * advancing the scan pointer.
 * Returns +nil+ if the scanner is at the end of the string.
 *
 *   s = StringScanner.new('ab')
 *   s.peek_byte         # => 97
 *   s.peek_byte         # => 97
 */
static VALUE
strscan_peek_byte(VALUE self)
{
    struct strscanner *p;

    GET_SCANNER(self, p);

    /* Go through unsigned char so bytes >= 0x80 are not sign-extended. */
    return EOS_P(p) ? Qnil : INT2FIX((unsigned char)*CURPTR(p));
}
/*
* Scans one byte and returns it.
* This method is not multibyte character sensitive.
@ -1605,6 +1656,7 @@ strscan_named_captures(VALUE self)
*
* - #getch
* - #get_byte
* - #scan_byte
* - #scan
* - #scan_until
* - #skip
@ -1617,6 +1669,7 @@ strscan_named_captures(VALUE self)
* - #exist?
* - #match?
* - #peek
* - #peek_byte
*
* === Finding Where we Are
*
@ -1708,7 +1761,9 @@ Init_strscan(void)
rb_define_method(StringScanner, "getch", strscan_getch, 0);
rb_define_method(StringScanner, "get_byte", strscan_get_byte, 0);
rb_define_method(StringScanner, "getbyte", strscan_getbyte, 0);
rb_define_method(StringScanner, "scan_byte", strscan_scan_byte, 0);
rb_define_method(StringScanner, "peek", strscan_peek, 1);
rb_define_method(StringScanner, "peek_byte", strscan_peek_byte, 0);
rb_define_method(StringScanner, "peep", strscan_peep, 1);
rb_define_method(StringScanner, "unscan", strscan_unscan, 0);

View File

@ -8,6 +8,29 @@ require 'strscan'
require 'test/unit'
module StringScannerTests
def test_peek_byte
  # peek_byte must report the upcoming byte without consuming it;
  # the following scan_byte then returns that same byte and moves on.
  scanner = create_string_scanner('ab')
  [97, 98].each do |code|
    assert_equal code, scanner.peek_byte
    assert_equal code, scanner.scan_byte
  end

  # Once the input is exhausted both methods return nil.
  assert_nil scanner.peek_byte
  assert_nil scanner.scan_byte
end
def test_scan_byte
  # ASCII input: every byte comes back as its ordinal, then nil at the end.
  ascii = create_string_scanner('ab')
  [97, 98].each { |code| assert_equal code, ascii.scan_byte }
  assert_nil ascii.scan_byte

  # Multibyte (EUC-JP) input: scan_byte walks raw bytes, not characters.
  str = "\244\242".dup.force_encoding("euc-jp")
  s = StringScanner.new(str)
  2.times do
    # Read the expected byte before scan_byte moves the position.
    expected = str.getbyte(s.pos)
    assert_equal expected, s.scan_byte
  end
  assert_nil s.scan_byte
end
def test_s_new
s = create_string_scanner('test string')
assert_instance_of StringScanner, s