diff --git a/mypy/typeshed/stubs/librt/librt/strings.pyi b/mypy/typeshed/stubs/librt/librt/strings.pyi
index 7a028f9e7859..94e3b69abf24 100644
--- a/mypy/typeshed/stubs/librt/librt/strings.pyi
+++ b/mypy/typeshed/stubs/librt/librt/strings.pyi
@@ -48,3 +48,12 @@ def isdigit(c: i32, /) -> bool: ...
 def isalnum(c: i32, /) -> bool: ...
 def isalpha(c: i32, /) -> bool: ...
 def isidentifier(c: i32, /) -> bool: ...
+
+# Codepoint case conversion. For the rare codepoints whose Unicode
+# uppercase / lowercase expands to multiple codepoints (e.g. U+00DF
+# uppercases to "SS", U+FB01 to "FI"), returns the input unchanged so
+# the signature stays i32 -> i32. Use str.upper() / str.lower() for full
+# Unicode case conversion when those cases matter. Inputs that are not
+# valid codepoints (negative or above U+10FFFF) are returned unchanged.
+def toupper(c: i32, /) -> i32: ...
+def tolower(c: i32, /) -> i32: ...
diff --git a/mypyc/lib-rt/strings/librt_strings.c b/mypyc/lib-rt/strings/librt_strings.c
index d95d5afb4860..b79ffcd9f7c8 100644
--- a/mypyc/lib-rt/strings/librt_strings.c
+++ b/mypyc/lib-rt/strings/librt_strings.c
@@ -1191,6 +1191,18 @@ DEFINE_CP_BOOL_WRAPPER(isalnum, LibRTStrings_IsAlnum)
 DEFINE_CP_BOOL_WRAPPER(isalpha, LibRTStrings_IsAlpha)
 DEFINE_CP_BOOL_WRAPPER(isidentifier, LibRTStrings_IsIdentifier)
 
+#define DEFINE_CP_I32_WRAPPER(name, fn) \
+    static PyObject* \
+    cp_##name(PyObject *module, PyObject *arg) { \
+        int32_t c; \
+        if (cp_parse_i32(arg, &c) < 0) \
+            return NULL; \
+        return PyLong_FromLong((long) fn(c)); \
+    }
+
+DEFINE_CP_I32_WRAPPER(toupper, LibRTStrings_ToUpper)
+DEFINE_CP_I32_WRAPPER(tolower, LibRTStrings_ToLower)
+
 static PyMethodDef librt_strings_module_methods[] = {
     {"write_i16_le", (PyCFunction) write_i16_le, METH_FASTCALL,
      PyDoc_STR("Write a 16-bit signed integer to BytesWriter in little-endian format")
@@ -1267,6 +1279,12 @@ static PyMethodDef librt_strings_module_methods[] = {
     {"isidentifier", cp_isidentifier, METH_O,
      PyDoc_STR("Test whether a codepoint (i32) is a valid identifier start (XID_Start).")
     },
+    {"toupper", cp_toupper, METH_O,
+     PyDoc_STR("Single-codepoint uppercase mapping for a codepoint (i32). Returns the input unchanged if the Unicode uppercase expands to multiple codepoints (e.g. U+00DF uppercases to \"SS\"); use str.upper() for full Unicode case conversion.")
+    },
+    {"tolower", cp_tolower, METH_O,
+     PyDoc_STR("Single-codepoint lowercase mapping for a codepoint (i32). Returns the input unchanged if the Unicode lowercase expands to multiple codepoints; use str.lower() for full Unicode case conversion.")
+    },
     {NULL, NULL, 0, NULL}
 };
 
diff --git a/mypyc/lib-rt/strings/librt_strings.h b/mypyc/lib-rt/strings/librt_strings.h
index c3cbd2f2237a..6c1942667ba4 100644
--- a/mypyc/lib-rt/strings/librt_strings.h
+++ b/mypyc/lib-rt/strings/librt_strings.h
@@ -73,4 +73,56 @@ static inline bool LibRTStrings_IsIdentifier(int32_t c) {
     return r == 1;
 }
 
+// Shared slow path for LibRTStrings_ToUpper / _ToLower. Round-trips the
+// codepoint through CPython's str.upper / str.lower on a 1-character
+// string. When the conversion expands to multiple codepoints (e.g.
+// 'ß'.upper() == 'SS') we return the input unchanged so the public
+// helpers stay i32 -> i32. Values that are not Unicode codepoints
+// (negative or above U+10FFFF) are also returned unchanged:
+// PyUnicode_FromOrdinal rejects them with ValueError, which must not be
+// mistaken for an allocation failure. Aborts via CPyError_OutOfMemory on
+// allocation failure.
+static inline int32_t LibRTStrings_ChangeCase_slow(int32_t c, const char *method) {
+    if (c < 0 || c > 0x10FFFF) {
+        return c;
+    }
+    PyObject *s = PyUnicode_FromOrdinal((int)c);
+    if (s == NULL) {
+        CPyError_OutOfMemory();
+    }
+    PyObject *u = PyObject_CallMethod(s, method, NULL);
+    Py_DECREF(s);
+    if (u == NULL) {
+        CPyError_OutOfMemory();
+    }
+    int32_t result = c;
+    if (PyUnicode_GET_LENGTH(u) == 1) {
+        result = (int32_t)PyUnicode_READ_CHAR(u, 0);
+    }
+    Py_DECREF(u);
+    return result;
+}
+
+// Uppercase a codepoint. ASCII fast path is `a..z -> A..Z` (subtract 32);
+// non-ASCII delegates to str.upper on a 1-character string. Returns the
+// input unchanged when uppercasing expands to multiple codepoints or when
+// the input is not a valid codepoint (negative or above U+10FFFF).
+static inline int32_t LibRTStrings_ToUpper(int32_t c) {
+    if (c < 0) return c;
+    if (c >= 'a' && c <= 'z') return c - 32;
+    if (c < 128) return c;
+    return LibRTStrings_ChangeCase_slow(c, "upper");
+}
+
+// Lowercase a codepoint. ASCII fast path is `A..Z -> a..z` (add 32);
+// non-ASCII delegates to str.lower on a 1-character string. Returns the
+// input unchanged when lowercasing expands to multiple codepoints or when
+// the input is not a valid codepoint (negative or above U+10FFFF).
+static inline int32_t LibRTStrings_ToLower(int32_t c) {
+    if (c < 0) return c;
+    if (c >= 'A' && c <= 'Z') return c + 32;
+    if (c < 128) return c;
+    return LibRTStrings_ChangeCase_slow(c, "lower");
+}
+
 #endif // LIBRT_STRINGS_H
diff --git a/mypyc/primitives/librt_strings_ops.py b/mypyc/primitives/librt_strings_ops.py
index f025c6e95b71..f3fceb483f96 100644
--- a/mypyc/primitives/librt_strings_ops.py
+++ b/mypyc/primitives/librt_strings_ops.py
@@ -438,3 +438,27 @@
     error_kind=ERR_NEVER,
     dependencies=[LIBRT_STRINGS],
 )
+
+# Codepoint case conversion. When the Unicode uppercase/lowercase of a
+# codepoint expands to multiple codepoints (e.g. U+00DF uppercases to "SS",
+# U+FB01 to "FI"), returns the input unchanged so the signature stays
+# i32 -> i32; callers needing full Unicode case conversion should use
+# str.upper() / .lower() instead. Inputs that are not valid codepoints
+# (negative or above U+10FFFF) are returned unchanged.
+function_op(
+    name="librt.strings.toupper",
+    arg_types=[int32_rprimitive],
+    return_type=int32_rprimitive,
+    c_function_name="LibRTStrings_ToUpper",
+    error_kind=ERR_NEVER,
+    dependencies=[LIBRT_STRINGS],
+)
+
+function_op(
+    name="librt.strings.tolower",
+    arg_types=[int32_rprimitive],
+    return_type=int32_rprimitive,
+    c_function_name="LibRTStrings_ToLower",
+    error_kind=ERR_NEVER,
+    dependencies=[LIBRT_STRINGS],
+)
diff --git a/mypyc/test-data/irbuild-librt-strings.test b/mypyc/test-data/irbuild-librt-strings.test
index e3aaa49bd6f9..83523cb2468e 100644
--- a/mypyc/test-data/irbuild-librt-strings.test
+++ b/mypyc/test-data/irbuild-librt-strings.test
@@ -401,3 +401,29 @@ def is_id(c):
 L0:
     r0 = LibRTStrings_IsIdentifier(c)
     return r0
+
+[case testLibrtStringsToUpperIR]
+from librt.strings import toupper
+from mypy_extensions import i32
+
+def up(c: i32) -> i32:
+    return toupper(c)
+[out]
+def up(c):
+    c, r0 :: i32
+L0:
+    r0 = LibRTStrings_ToUpper(c)
+    return r0
+
+[case testLibrtStringsToLowerIR]
+from librt.strings import tolower
+from mypy_extensions import i32
+
+def lo(c: i32) -> i32:
+    return tolower(c)
+[out]
+def lo(c):
+    c, r0 :: i32
+L0:
+    r0 = LibRTStrings_ToLower(c)
+    return r0
diff --git a/mypyc/test-data/run-librt-strings.test b/mypyc/test-data/run-librt-strings.test
index 0a3320ff6522..7efff12667d8 100644
--- a/mypyc/test-data/run-librt-strings.test
+++ b/mypyc/test-data/run-librt-strings.test
@@ -1490,3 +1490,58 @@ def test_codepoint_classifiers_via_any() -> None:
         f(1 << 40)
     with assertRaises(OverflowError, "codepoint out of i32 range"):
         f(-(1 << 40))
+
+[case testLibrtStringsCodepointCaseConversion_librt]
+from typing import Any
+from mypy_extensions import i32
+from librt.strings import toupper, tolower
+
+from testutil import assertRaises
+
+
+def _expect(c: str, method: str) -> int:
+    # The contract: i32 -> i32 when conversion yields exactly one codepoint,
+    # else return the input unchanged.
+    converted = getattr(c, method)()
+    if len(converted) == 1:
+        return ord(converted)
+    return ord(c)
+
+
+def test_codepoint_case_conversion() -> None:
+    # Negative inputs return unchanged.
+    for bad in (i32(-1), i32(-113)):
+        assert toupper(bad) == bad
+        assert tolower(bad) == bad
+    # i32 values above U+10FFFF are not codepoints; they return unchanged too.
+    for big in (i32(0x110000), i32(0x7FFFFFFF)):
+        assert toupper(big) == big
+        assert tolower(big) == big
+    # Agree with str.upper / str.lower across the full Unicode range
+    # whenever the conversion is single-codepoint; otherwise return input.
+    for i in range(0x110000):
+        c = chr(i)
+        o = ord(c)
+        assert toupper(o) == _expect(c, "upper")
+        assert tolower(o) == _expect(c, "lower")
+
+
+def test_codepoint_case_conversion_via_any() -> None:
+    # Iterate to force generic dispatch through the PyMethodDef wrapper.
+    for fn, in_cp, out_cp in (
+        (toupper, ord("a"), ord("A")),
+        (toupper, ord("A"), ord("A")),
+        (tolower, ord("Z"), ord("z")),
+        (tolower, ord("z"), ord("z")),
+    ):
+        f: Any = fn
+        assert f(in_cp) == out_cp
+        # Negative values are valid i32, returned unchanged.
+        assert f(-1) == -1
+        # i32 values above U+10FFFF are not codepoints; also returned unchanged.
+        assert f(0x110000) == 0x110000
+        # Inputs outside i32 range raise OverflowError through the wrapper.
+        with assertRaises(OverflowError, "codepoint out of i32 range"):
+            f(1 << 40)
+        with assertRaises(OverflowError, "codepoint out of i32 range"):
+            f(-(1 << 40))