Skip to content

Commit 2d0a381

Browse files
committed
Issue #139123: Reimplement base UUID type, uuid4(), and uuid7() in C
The C implementation considerably boosts the performance of the key UUID operations: ------------------------------------ Operation Speedup ------------------------------------ uuid4() generation 15.01x uuid7() generation 29.64x UUID from string 6.76x UUID from bytes 5.16x str(uuid) conversion 6.66x ------------------------------------ Summary of changes: * The UUID type is reimplemented in C in its entirety. * The pure-Python implementation is kept around and is used if the C implementation isn't available for some reason. * Both implementations are tested extensively; additional tests are added to ensure that the C implementation of the type follows the pure-Python implementation fully. * The Python implementation stores UUID values as int objects. The C implementation stores them as a `uint8_t[16]` array. * The C implementation supports unpickling of UUIDs created with Python 2 using protocols starting with 0. That necessitated a small fix to the `copyreg` module (the change only affects the legacy pickle pathway). * The C implementation has a faster hash() implementation and also caches the computed hash value to speed up cases when UUIDs are used as set/dict keys. * The C implementation has a freelist to make new UUID object instantiation as fast as possible. * uuid4() and uuid7() are now implemented in C. The biggest performance boost (10x) comes from overfetching entropy to decrease the number of _PyOS_URandom() calls. On its own it's a safe optimization, with the edge case that Unix fork needs to be explicitly handled. We do that by comparing the current PID to the PID recorded when the random buffer was populated. * Portions of the code come from my implementation of faster UUID in gel-python [1]. I did use AI during the development, but basically had to rewrite the code it generated to be more idiomatic and efficient. * The benchmark can be found here [2]. * This PR makes Python UUID operations as fast as they are in the Node.js and Bun runtimes. 
[1] https://github.com/MagicStack/py-pgproto/blob/b8109fb311a59f30f9947567a13508da9a776564/uuid.pyx [2] https://gist.github.com/1st1/f03e816f34a61e4d46c78ff98baf4818
1 parent d6a6fe2 commit 2d0a381

File tree

7 files changed

+2171
-61
lines changed

7 files changed

+2171
-61
lines changed

Include/internal/pycore_pylifecycle.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -98,8 +98,8 @@ extern const char* _Py_gitversion(void);
9898
// Export for '_asyncio' shared extension
9999
PyAPI_FUNC(int) _Py_IsInterpreterFinalizing(PyInterpreterState *interp);
100100

101-
/* Random */
102-
extern int _PyOS_URandom(void *buffer, Py_ssize_t size);
101+
// Export for '_uuid' shared extension
102+
PyAPI_FUNC(int) _PyOS_URandom(void *buffer, Py_ssize_t size);
103103

104104
// Export for '_random' shared extension
105105
PyAPI_FUNC(int) _PyOS_URandomNonblock(void *buffer, Py_ssize_t size);

Lib/copyreg.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,13 @@ def pickle_super(obj):
4444
# Support for pickling new-style objects
4545

4646
def _reconstructor(cls, base, state):
47-
if base is object:
47+
if cls.__module__ == '_uuid' and cls.__name__ == 'UUID' and base is object:
48+
# Compatibility with ancient pickled data created before uuid was rewritten in C:
49+
# object.__new__ used to work for such data, but it no longer can.
50+
obj = cls.__new__(cls)
51+
return obj
52+
53+
if base is object and cls.__name__ != 'UUID':
4854
obj = object.__new__(cls)
4955
else:
5056
obj = base.__new__(cls, state)

Lib/test/test_uuid.py

Lines changed: 110 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ def get_command_stdout(command, args):
3535

3636
class BaseTestUUID:
3737
uuid = None
38+
is_c_uuid = False
3839

3940
def test_nil_uuid(self):
4041
nil_uuid = self.uuid.NIL
@@ -282,14 +283,16 @@ def test_exceptions(self):
282283
badvalue(lambda: self.uuid.UUID('123456781234567812345678z2345678'))
283284

284285
# Badly formed bytes.
285-
badvalue(lambda: self.uuid.UUID(bytes='abc'))
286-
badvalue(lambda: self.uuid.UUID(bytes='\0'*15))
287-
badvalue(lambda: self.uuid.UUID(bytes='\0'*17))
286+
badtype(lambda: self.uuid.UUID(bytes='abc'))
287+
badvalue(lambda: self.uuid.UUID(bytes=b'abc'))
288+
badvalue(lambda: self.uuid.UUID(bytes=b'\0'*15))
289+
badvalue(lambda: self.uuid.UUID(bytes=b'\0'*17))
288290

289291
# Badly formed bytes_le.
290-
badvalue(lambda: self.uuid.UUID(bytes_le='abc'))
291-
badvalue(lambda: self.uuid.UUID(bytes_le='\0'*15))
292-
badvalue(lambda: self.uuid.UUID(bytes_le='\0'*17))
292+
badtype(lambda: self.uuid.UUID(bytes_le='abc'))
293+
badvalue(lambda: self.uuid.UUID(bytes_le=b'abc'))
294+
badvalue(lambda: self.uuid.UUID(bytes_le=b'\0'*15))
295+
badvalue(lambda: self.uuid.UUID(bytes_le=b'\0'*17))
293296

294297
# Badly formed fields.
295298
badvalue(lambda: self.uuid.UUID(fields=(1,)))
@@ -877,12 +880,18 @@ def test_uuid6_test_vectors(self):
877880
equal((u.int >> 80) & 0xffff, 0x232a)
878881
equal((u.int >> 96) & 0xffff_ffff, 0x1ec9_414c)
879882

880-
def test_uuid7(self):
883+
def test_uuid7_functional(self):
881884
equal = self.assertEqual
882885
u = self.uuid.uuid7()
883886
equal(u.variant, self.uuid.RFC_4122)
884887
equal(u.version, 7)
885888

889+
def test_uuid7_mock(self):
890+
if self.is_c_uuid:
891+
self.skipTest("C implementation of uuid7 cannot be tested with mocks")
892+
893+
equal = self.assertEqual
894+
886895
# 1 Jan 2023 12:34:56.123_456_789
887896
timestamp_ns = 1672533296_123_456_789 # ns precision
888897
timestamp_ms, _ = divmod(timestamp_ns, 1_000_000)
@@ -940,7 +949,15 @@ def test_uuid7_uniqueness(self):
940949
versions = {u.version for u in uuids}
941950
self.assertSetEqual(versions, {7})
942951

943-
def test_uuid7_monotonicity(self):
952+
def test_uuid7_monotonicity_functional(self):
953+
equal = self.assertEqual
954+
us = [self.uuid.uuid7() for _ in range(10_000)]
955+
equal(us, sorted(us))
956+
957+
def test_uuid7_monotonicity_mock(self):
958+
if self.is_c_uuid:
959+
self.skipTest("C implementation of uuid7 cannot be tested with mocks")
960+
944961
equal = self.assertEqual
945962

946963
us = [self.uuid.uuid7() for _ in range(10_000)]
@@ -1003,7 +1020,10 @@ def test_uuid7_monotonicity(self):
10031020

10041021
self.assertLess(u1, u2)
10051022

1006-
def test_uuid7_timestamp_backwards(self):
1023+
def test_uuid7_timestamp_backwards_mock(self):
1024+
if self.is_c_uuid:
1025+
self.skipTest("C implementation of uuid7 cannot be tested with mocks")
1026+
10071027
equal = self.assertEqual
10081028
# 1 Jan 2023 12:34:56.123_456_789
10091029
timestamp_ns = 1672533296_123_456_789 # ns precision
@@ -1043,7 +1063,10 @@ def test_uuid7_timestamp_backwards(self):
10431063
equal((u.int >> 32) & 0x3fff_ffff, counter_lo + 1)
10441064
equal(u.int & 0xffff_ffff, tail)
10451065

1046-
def test_uuid7_overflow_counter(self):
1066+
def test_uuid7_overflow_counter_mock(self):
1067+
if self.is_c_uuid:
1068+
self.skipTest("C implementation of uuid7 cannot be tested with mocks")
1069+
10471070
equal = self.assertEqual
10481071
# 1 Jan 2023 12:34:56.123_456_789
10491072
timestamp_ns = 1672533296_123_456_789 # ns precision
@@ -1149,6 +1172,7 @@ def test_uuid_weakref(self):
11491172

11501173
class CommandLineTestCases:
11511174
uuid = None # to be defined in subclasses
1175+
is_c_uuid = False
11521176

11531177
def do_test_standalone_uuid(self, version):
11541178
stdout = io.StringIO()
@@ -1257,6 +1281,7 @@ class TestUUIDWithoutExtModule(CommandLineTestCases, BaseTestUUID, unittest.Test
12571281
@unittest.skipUnless(c_uuid, 'requires the C _uuid module')
12581282
class TestUUIDWithExtModule(CommandLineTestCases, BaseTestUUID, unittest.TestCase):
12591283
uuid = c_uuid
1284+
is_c_uuid = True
12601285

12611286
def check_has_stable_libuuid_extractable_node(self):
12621287
if not self.uuid._has_stable_extractable_node:
@@ -1287,6 +1312,7 @@ def test_windows_getnode_from_libuuid(self):
12871312

12881313
class BaseTestInternals:
12891314
_uuid = py_uuid
1315+
is_c_uuid = False
12901316

12911317
def check_parse_mac(self, aix):
12921318
if not aix:
@@ -1480,6 +1506,7 @@ class TestInternalsWithoutExtModule(BaseTestInternals, unittest.TestCase):
14801506
@unittest.skipUnless(c_uuid, 'requires the C _uuid module')
14811507
class TestInternalsWithExtModule(BaseTestInternals, unittest.TestCase):
14821508
uuid = c_uuid
1509+
is_c_uuid = True
14831510

14841511
@unittest.skipUnless(os.name == 'posix', 'requires Posix')
14851512
def test_unix_getnode(self):
@@ -1497,5 +1524,78 @@ def test_windll_getnode(self):
14971524
self.check_node(node)
14981525

14991526

1527+
@unittest.skipUnless(c_uuid, "requires the C _uuid module")
1528+
class TestCImplementationCompat(unittest.TestCase):
1529+
def test_compatibility(self):
1530+
import uuid
1531+
1532+
PU = uuid._py_UUID
1533+
CU = uuid._c_UUID
1534+
N = 1000
1535+
1536+
uuids = [
1537+
"00000000-0000-0000-0000-000000000000",
1538+
"ffffffff-ffff-ffff-ffff-ffffffffffff",
1539+
"c0bec4fd-e4e3-050c-a362-da3f734ffd56", # regression
1540+
*(str(uuid.uuid4()) for _ in range(N)),
1541+
*(str(uuid.uuid7()) for _ in range(N)),
1542+
*(str(uuid.uuid1()) for _ in range(N)),
1543+
*(str(uuid.UUID(bytes=os.urandom(16))) for _ in range(N)),
1544+
]
1545+
1546+
def full_test(p, u):
1547+
self.assertEqual(p, u)
1548+
self.assertEqual(p.hex, u.hex)
1549+
self.assertEqual(p.int, u.int)
1550+
self.assertEqual(p.variant, u.variant)
1551+
self.assertEqual(p.version, u.version)
1552+
self.assertEqual(p.is_safe, u.is_safe)
1553+
self.assertEqual(p.bytes, u.bytes)
1554+
self.assertEqual(p.bytes_le, u.bytes_le)
1555+
self.assertEqual(p.fields, u.fields)
1556+
self.assertEqual(p.time_low, u.time_low)
1557+
self.assertEqual(p.time_mid, u.time_mid)
1558+
self.assertEqual(p.time_hi_version, u.time_hi_version)
1559+
self.assertEqual(p.clock_seq_hi_variant, u.clock_seq_hi_variant)
1560+
self.assertEqual(p.clock_seq_low, u.clock_seq_low)
1561+
self.assertEqual(p.node, u.node)
1562+
1563+
all_ps = set()
1564+
all_us = set()
1565+
for uuid_str in uuids:
1566+
with self.subTest(uuid=uuid_str):
1567+
p = PU(uuid_str)
1568+
u = CU(uuid_str)
1569+
full_test(p, u)
1570+
1571+
u2 = CU(bytes_le=p.bytes_le)
1572+
full_test(p, u2)
1573+
1574+
u3 = CU(fields=p.fields)
1575+
full_test(p, u3)
1576+
1577+
u4 = CU(int=p.int)
1578+
full_test(p, u4)
1579+
1580+
u5 = CU(
1581+
hex=p.hex,
1582+
is_safe=uuid.SafeUUID.safe,
1583+
)
1584+
full_test(
1585+
PU(
1586+
uuid_str,
1587+
is_safe=uuid.SafeUUID.safe,
1588+
),
1589+
u5,
1590+
)
1591+
1592+
all_ps.add(p)
1593+
all_us.add(u)
1594+
1595+
self.assertEqual(len(all_ps), len(all_us))
1596+
self.assertEqual(len(all_ps), len(uuids))
1597+
1598+
1599+
15001600
if __name__ == '__main__':
15011601
unittest.main()

Lib/uuid.py

Lines changed: 52 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@ class SafeUUID:
9999

100100

101101
_UINT_128_MAX = (1 << 128) - 1
102+
102103
# 128-bit mask to clear the variant and version bits of a UUID integral value
103104
_RFC_4122_CLEARFLAGS_MASK = ~((0xf000 << 64) | (0xc000 << 48))
104105
# RFC 4122 variant bits and version bits to activate on a UUID integral value.
@@ -111,6 +112,28 @@ class SafeUUID:
111112
_RFC_4122_VERSION_8_FLAGS = ((8 << 76) | (0x8000 << 48))
112113

113114

115+
def _unpickle(state):
116+
"""Internal function to unpickle a UUID from a state dictionary."""
117+
# This is also used by the C extension module for pickle compatibility
118+
# with protocols 0 & 1.
119+
obj = UUID.__new__(UUID)
120+
obj.__setstate__(state)
121+
return obj
122+
123+
124+
# Import optional C extension at toplevel, to help disabling it when testing
125+
try:
126+
import _uuid
127+
_generate_time_safe = getattr(_uuid, "generate_time_safe", None)
128+
_has_stable_extractable_node = _uuid.has_stable_extractable_node
129+
_UuidCreate = getattr(_uuid, "UuidCreate", None)
130+
except ImportError:
131+
_uuid = None
132+
_generate_time_safe = None
133+
_has_stable_extractable_node = False
134+
_UuidCreate = None
135+
136+
114137
class UUID:
115138
"""Instances of the UUID class represent UUIDs as specified in RFC 4122.
116139
UUID objects are immutable, hashable, and usable as dictionary keys.
@@ -219,13 +242,21 @@ def __init__(self, hex=None, bytes=None, bytes_le=None, fields=None,
219242
raise ValueError('badly formed hexadecimal UUID string')
220243
int = int_(hex, 16)
221244
elif bytes_le is not None:
245+
if not isinstance(bytes_le, bytes_):
246+
raise TypeError(
247+
f'a bytes-like object is required, not {type(bytes_le).__name__!r}'
248+
)
222249
if len(bytes_le) != 16:
223250
raise ValueError('bytes_le is not a 16-char string')
224251
assert isinstance(bytes_le, bytes_), repr(bytes_le)
225252
bytes = (bytes_le[4-1::-1] + bytes_le[6-1:4-1:-1] +
226253
bytes_le[8-1:6-1:-1] + bytes_le[8:])
227254
int = int_.from_bytes(bytes) # big endian
228255
elif bytes is not None:
256+
if not isinstance(bytes, bytes_):
257+
raise TypeError(
258+
f'a bytes-like object is required, not {type(bytes).__name__!r}'
259+
)
229260
if len(bytes) != 16:
230261
raise ValueError('bytes is not a 16-char string')
231262
assert isinstance(bytes, bytes_), repr(bytes)
@@ -234,7 +265,7 @@ def __init__(self, hex=None, bytes=None, bytes_le=None, fields=None,
234265
if len(fields) != 6:
235266
raise ValueError('fields is not a 6-tuple')
236267
(time_low, time_mid, time_hi_version,
237-
clock_seq_hi_variant, clock_seq_low, node) = fields
268+
clock_seq_hi_variant, clock_seq_low, node) = fields
238269
if not 0 <= time_low < (1 << 32):
239270
raise ValueError('field 1 out of range (need a 32-bit value)')
240271
if not 0 <= time_mid < (1 << 16):
@@ -249,7 +280,7 @@ def __init__(self, hex=None, bytes=None, bytes_le=None, fields=None,
249280
raise ValueError('field 6 out of range (need a 48-bit value)')
250281
clock_seq = (clock_seq_hi_variant << 8) | clock_seq_low
251282
int = ((time_low << 96) | (time_mid << 80) |
252-
(time_hi_version << 64) | (clock_seq << 48) | node)
283+
(time_hi_version << 64) | (clock_seq << 48) | node)
253284
if not 0 <= int <= _UINT_128_MAX:
254285
raise ValueError('int is out of range (need a 128-bit value)')
255286
if version is not None:
@@ -273,6 +304,11 @@ def _from_int(cls, value):
273304
object.__setattr__(self, 'is_safe', SafeUUID.unknown)
274305
return self
275306

307+
def __reduce_ex__(self, protocol):
308+
# Primarily we define __reduce_ex__ to make the C implementation
309+
# compatible with protocols 0 & 1.
310+
return _unpickle, (self.__getstate__(),)
311+
276312
def __getstate__(self):
277313
d = {'int': self.int}
278314
if self.is_safe != SafeUUID.unknown:
@@ -629,19 +665,6 @@ def _netstat_getnode():
629665
return _find_mac_under_heading('netstat', '-ian', b'Address')
630666

631667

632-
# Import optional C extension at toplevel, to help disabling it when testing
633-
try:
634-
import _uuid
635-
_generate_time_safe = getattr(_uuid, "generate_time_safe", None)
636-
_has_stable_extractable_node = _uuid.has_stable_extractable_node
637-
_UuidCreate = getattr(_uuid, "UuidCreate", None)
638-
except ImportError:
639-
_uuid = None
640-
_generate_time_safe = None
641-
_has_stable_extractable_node = False
642-
_UuidCreate = None
643-
644-
645668
def _unix_getnode():
646669
"""Get the hardware address on Unix using the _uuid extension module."""
647670
if _generate_time_safe and _has_stable_extractable_node:
@@ -932,6 +955,20 @@ def uuid8(a=None, b=None, c=None):
932955
return UUID._from_int(int_uuid_8)
933956

934957

958+
_py_uuid4 = uuid4
959+
_py_uuid7 = uuid7
960+
_py_UUID = UUID
961+
try:
962+
from _uuid import UUID, uuid4, uuid7
963+
except ImportError:
964+
_c_UUID = None
965+
_c_uuid4 = None
966+
_c_uuid7 = None
967+
else:
968+
_c_UUID = UUID
969+
_c_uuid4 = uuid4
970+
_c_uuid7 = uuid7
971+
935972
def main():
936973
"""Run the uuid command line interface."""
937974
uuid_funcs = {

0 commit comments

Comments
 (0)