gh-139871: Optimize bytearray construction with encoding (#142243)

When a `str` is encoded in `bytearray.__init__`, the encoder usually
creates a new, uniquely referenced bytes object. Rather than allocating
new memory and copying the bytes, use the already-created bytes object as
the bytearray's backing buffer. The bigger the `str`, the bigger the saving.

Mean +- std dev: [main_encoding] 497 us +- 9 us -> [encoding] 14.2 us +- 0.3 us: 34.97x faster

```python
import pyperf

runner = pyperf.Runner()

runner.timeit(
    name="encode",
    setup="a = 'a' * 1_000_000",
    stmt="bytearray(a, encoding='utf8')")
```
This commit is contained in:
Cody Maloney 2025-12-15 04:10:31 -08:00 committed by GitHub
parent 850f95f6f6
commit 14e6052b43
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -914,6 +914,10 @@ bytearray___init___impl(PyByteArrayObject *self, PyObject *arg,
return -1;
}
/* Should be caused by first init or the resize to 0. */
assert(self->ob_bytes_object == Py_GetConstantBorrowed(Py_CONSTANT_EMPTY_BYTES));
assert(self->ob_exports == 0);
/* Make a quick exit if no first argument */
if (arg == NULL) {
if (encoding != NULL || errors != NULL) {
@ -935,9 +939,20 @@ bytearray___init___impl(PyByteArrayObject *self, PyObject *arg,
return -1;
}
encoded = PyUnicode_AsEncodedString(arg, encoding, errors);
if (encoded == NULL)
if (encoded == NULL) {
return -1;
}
assert(PyBytes_Check(encoded));
/* Most encodes return a new unique bytes, just use it as buffer. */
if (_PyObject_IsUniquelyReferenced(encoded)
&& PyBytes_CheckExact(encoded))
{
Py_ssize_t size = Py_SIZE(encoded);
self->ob_bytes_object = encoded;
bytearray_reinit_from_bytes(self, size, size);
return 0;
}
new = bytearray_iconcat((PyObject*)self, encoded);
Py_DECREF(encoded);
if (new == NULL)