diff --git a/docs/advanced/cast/overview.rst b/docs/advanced/cast/overview.rst index ab37b90b..54c11a90 100644 --- a/docs/advanced/cast/overview.rst +++ b/docs/advanced/cast/overview.rst @@ -94,14 +94,26 @@ as arguments and return values, refer to the section on binding :ref:`classes`. +------------------------------------+---------------------------+-------------------------------+ | ``char`` | Character literal | :file:`pybind11/pybind11.h` | +------------------------------------+---------------------------+-------------------------------+ +| ``char16_t`` | UTF-16 character literal | :file:`pybind11/pybind11.h` | ++------------------------------------+---------------------------+-------------------------------+ +| ``char32_t`` | UTF-32 character literal | :file:`pybind11/pybind11.h` | ++------------------------------------+---------------------------+-------------------------------+ | ``wchar_t`` | Wide character literal | :file:`pybind11/pybind11.h` | +------------------------------------+---------------------------+-------------------------------+ | ``const char *`` | UTF-8 string literal | :file:`pybind11/pybind11.h` | +------------------------------------+---------------------------+-------------------------------+ +| ``const char16_t *`` | UTF-16 string literal | :file:`pybind11/pybind11.h` | ++------------------------------------+---------------------------+-------------------------------+ +| ``const char32_t *`` | UTF-32 string literal | :file:`pybind11/pybind11.h` | ++------------------------------------+---------------------------+-------------------------------+ | ``const wchar_t *`` | Wide string literal | :file:`pybind11/pybind11.h` | +------------------------------------+---------------------------+-------------------------------+ | ``std::string`` | STL dynamic UTF-8 string | :file:`pybind11/pybind11.h` | +------------------------------------+---------------------------+-------------------------------+ +| ``std::u16string`` | STL dynamic UTF-16 
string | :file:`pybind11/pybind11.h` | ++------------------------------------+---------------------------+-------------------------------+ +| ``std::u32string`` | STL dynamic UTF-32 string | :file:`pybind11/pybind11.h` | ++------------------------------------+---------------------------+-------------------------------+ | ``std::wstring`` | STL dynamic wide string | :file:`pybind11/pybind11.h` | +------------------------------------+---------------------------+-------------------------------+ | ``std::pair`` | Pair of two custom types | :file:`pybind11/pybind11.h` | diff --git a/include/pybind11/cast.h b/include/pybind11/cast.h index 44c961c7..cfc6f8b7 100644 --- a/include/pybind11/cast.h +++ b/include/pybind11/cast.h @@ -471,8 +471,15 @@ public: template using cast_op_type = pybind11::detail::cast_op_type<_T> +template using is_std_char_type = any_of< + std::is_same, /* std::string */ + std::is_same, /* std::u16string */ + std::is_same, /* std::u32string */ + std::is_same /* std::wstring */ +>; + template -struct type_caster::value>> { +struct type_caster::value && !is_std_char_type::value>> { using _py_type_0 = conditional_t; using _py_type_1 = conditional_t::value, _py_type_0, typename std::make_unsigned<_py_type_0>::type>; using py_type = conditional_t::value, double, _py_type_1>; @@ -617,122 +624,144 @@ public: PYBIND11_TYPE_CASTER(bool, _("bool")); }; -template <> class type_caster { -public: - bool load(handle src, bool) { - object temp; - handle load_src = src; - if (!src) { - return false; - } else if (PyUnicode_Check(load_src.ptr())) { - temp = reinterpret_steal(PyUnicode_AsUTF8String(load_src.ptr())); - if (!temp) { PyErr_Clear(); return false; } // UnicodeEncodeError - load_src = temp; - } - char *buffer; - ssize_t length; - int err = PYBIND11_BYTES_AS_STRING_AND_SIZE(load_src.ptr(), &buffer, &length); - if (err == -1) { PyErr_Clear(); return false; } // TypeError - value = std::string(buffer, (size_t) length); - success = true; - return true; - } - - 
static handle cast(const std::string &src, return_value_policy /* policy */, handle /* parent */) { - return PyUnicode_FromStringAndSize(src.c_str(), (ssize_t) src.length()); - } - - PYBIND11_TYPE_CASTER(std::string, _(PYBIND11_STRING_NAME)); -protected: - bool success = false; -}; - -template <> class type_caster { -public: +// Helper class for UTF-{8,16,32} C++ stl strings: +template +struct type_caster, enable_if_t::value>> { + // Simplify life by being able to assume standard char sizes (the standard only guarantees + // minimums), but Python requires exact sizes + static_assert(!std::is_same::value || sizeof(CharT) == 1, "Unsupported char size != 1"); + static_assert(!std::is_same::value || sizeof(CharT) == 2, "Unsupported char16_t size != 2"); + static_assert(!std::is_same::value || sizeof(CharT) == 4, "Unsupported char32_t size != 4"); + // wchar_t can be either 16 bits (Windows) or 32 (everywhere else) + static_assert(!std::is_same::value || sizeof(CharT) == 2 || sizeof(CharT) == 4, + "Unsupported wchar_t size != 2/4"); + static constexpr size_t UTF_N = 8 * sizeof(CharT); + static constexpr const char *encoding = UTF_N == 8 ? "utf8" : UTF_N == 16 ? "utf16" : "utf32"; + + using StringType = std::basic_string; + bool load(handle src, bool) { +#if PY_MAJOR_VERSION < 3 object temp; +#endif handle load_src = src; if (!src) { return false; } else if (!PyUnicode_Check(load_src.ptr())) { +#if PY_MAJOR_VERSION >= 3 + return false; + // The below is a guaranteed failure in Python 3 when PyUnicode_Check returns false +#else temp = reinterpret_steal(PyUnicode_FromObject(load_src.ptr())); if (!temp) { PyErr_Clear(); return false; } load_src = temp; - } - wchar_t *buffer = nullptr; - ssize_t length = -1; -#if PY_MAJOR_VERSION >= 3 - buffer = PyUnicode_AsWideCharString(load_src.ptr(), &length); -#else - temp = reinterpret_steal(PyUnicode_AsEncodedString( - load_src.ptr(), sizeof(wchar_t) == sizeof(short) - ? 
"utf16" : "utf32", nullptr)); - - if (temp) { - int err = PYBIND11_BYTES_AS_STRING_AND_SIZE(temp.ptr(), (char **) &buffer, &length); - if (err == -1) { buffer = nullptr; } // TypeError - length = length / (ssize_t) sizeof(wchar_t) - 1; ++buffer; // Skip BOM - } #endif - if (!buffer) { PyErr_Clear(); return false; } - value = std::wstring(buffer, (size_t) length); - success = true; + } + + object utfNbytes = reinterpret_steal(PyUnicode_AsEncodedString( + load_src.ptr(), encoding, nullptr)); + if (!utfNbytes) { PyErr_Clear(); return false; } + + const CharT *buffer = reinterpret_cast(PYBIND11_BYTES_AS_STRING(utfNbytes.ptr())); + size_t length = (size_t) PYBIND11_BYTES_SIZE(utfNbytes.ptr()) / sizeof(CharT); + if (UTF_N > 8) { buffer++; length--; } // Skip BOM for UTF-16/32 + value = StringType(buffer, length); return true; } - static handle cast(const std::wstring &src, return_value_policy /* policy */, handle /* parent */) { - return PyUnicode_FromWideChar(src.c_str(), (ssize_t) src.length()); + static handle cast(const StringType &src, return_value_policy /* policy */, handle /* parent */) { + const char *buffer = reinterpret_cast(src.c_str()); + ssize_t nbytes = ssize_t(src.size() * sizeof(CharT)); + handle s = PyUnicode_Decode(buffer, nbytes, encoding, nullptr); + if (!s) throw error_already_set(); + return s; } - PYBIND11_TYPE_CASTER(std::wstring, _(PYBIND11_STRING_NAME)); -protected: - bool success = false; + PYBIND11_TYPE_CASTER(StringType, _(PYBIND11_STRING_NAME)); }; -template <> class type_caster : public type_caster { +// Type caster for C-style strings. We basically use a std::string type caster, but also add the +// ability to use None as a nullptr char* (which the string caster doesn't allow). 
+template struct type_caster::value>> { + using StringType = std::basic_string; + using StringCaster = type_caster; + StringCaster str_caster; + bool none = false; public: bool load(handle src, bool convert) { - if (src.is_none()) return true; - return type_caster::load(src, convert); + if (!src) return false; + if (src.is_none()) { + // Defer accepting None to other overloads (if we aren't in convert mode): + if (!convert) return false; + none = true; + return true; + } + return str_caster.load(src, convert); } - static handle cast(const char *src, return_value_policy /* policy */, handle /* parent */) { - if (src == nullptr) return none().inc_ref(); - return PyUnicode_FromString(src); + static handle cast(const CharT *src, return_value_policy policy, handle parent) { + if (src == nullptr) return pybind11::none().inc_ref(); + return StringCaster::cast(StringType(src), policy, parent); } - static handle cast(char src, return_value_policy /* policy */, handle /* parent */) { - char str[2] = { src, '\0' }; - return PyUnicode_DecodeLatin1(str, 1, nullptr); + static handle cast(CharT src, return_value_policy policy, handle parent) { + if (std::is_same::value) { + handle s = PyUnicode_DecodeLatin1((const char *) &src, 1, nullptr); + if (!s) throw error_already_set(); + return s; + } + return StringCaster::cast(StringType(1, src), policy, parent); } - operator char*() { return success ? 
const_cast(value.c_str()) : nullptr; } - operator char&() { return value[0]; } - - static PYBIND11_DESCR name() { return type_descr(_(PYBIND11_STRING_NAME)); } -}; - -template <> class type_caster : public type_caster { -public: - bool load(handle src, bool convert) { - if (src.is_none()) return true; - return type_caster::load(src, convert); - } - - static handle cast(const wchar_t *src, return_value_policy /* policy */, handle /* parent */) { - if (src == nullptr) return none().inc_ref(); - return PyUnicode_FromWideChar(src, (ssize_t) wcslen(src)); - } - - static handle cast(wchar_t src, return_value_policy /* policy */, handle /* parent */) { - wchar_t wstr[2] = { src, L'\0' }; - return PyUnicode_FromWideChar(wstr, 1); - } - - operator wchar_t*() { return success ? const_cast(value.c_str()) : nullptr; } - operator wchar_t&() { return value[0]; } + operator CharT*() { return none ? nullptr : const_cast(static_cast(str_caster).c_str()); } + operator CharT() { + if (none) + throw value_error("Cannot convert None to a character"); + + auto &value = static_cast(str_caster); + size_t str_len = value.size(); + if (str_len == 0) + throw value_error("Cannot convert empty string to a character"); + + // If we're in UTF-8 mode, we have two possible failures: one for a unicode character that + // is too high, and one for multiple unicode characters (caught later), so we need to figure + // out how long the first encoded character is in bytes to distinguish between these two + // errors. We also want to allow unicode characters U+0080 through U+00FF, as those + // can fit into a single char value. + if (StringCaster::UTF_N == 8 && str_len > 1 && str_len <= 4) { + unsigned char v0 = static_cast(value[0]); + size_t char0_bytes = !(v0 & 0x80) ? 1 : // low bits only: 0-127 + (v0 & 0xE0) == 0xC0 ? 2 : // 0b110xxxxx - start of 2-byte sequence + (v0 & 0xF0) == 0xE0 ? 
3 : // 0b1110xxxx - start of 3-byte sequence + 4; // 0b11110xxx - start of 4-byte sequence + + if (char0_bytes == str_len) { + // If we have a 128-255 value, we can decode it into a single char: + if (char0_bytes == 2 && (v0 & 0xFC) == 0xC0) { // 0b110000xx 0b10xxxxxx + return static_cast(((v0 & 3) << 6) + (static_cast(value[1]) & 0x3F)); + } + // Otherwise we have a single character, but it's > U+00FF + throw value_error("Character code point not in range(0x100)"); + } + } + + // UTF-16 is much easier: we can only have a surrogate pair for values above U+FFFF, thus a + // surrogate pair with total length 2 instantly indicates a range error (but not a "your + // string was too long" error). + else if (StringCaster::UTF_N == 16 && str_len == 2) { + char16_t v0 = static_cast(value[0]); + if (v0 >= 0xD800 && v0 < 0xE000) + throw value_error("Character code point not in range(0x10000)"); + } + + if (str_len != 1) + throw value_error("Expected a character, but multi-character string found"); + + return value[0]; + } static PYBIND11_DESCR name() { return type_descr(_(PYBIND11_STRING_NAME)); } + template using cast_op_type = typename std::remove_reference>::type; }; template class type_caster> { diff --git a/include/pybind11/common.h b/include/pybind11/common.h index 7299fbf8..fb8504cc 100644 --- a/include/pybind11/common.h +++ b/include/pybind11/common.h @@ -111,6 +111,7 @@ #define PYBIND11_BYTES_FROM_STRING_AND_SIZE PyBytes_FromStringAndSize #define PYBIND11_BYTES_AS_STRING_AND_SIZE PyBytes_AsStringAndSize #define PYBIND11_BYTES_AS_STRING PyBytes_AsString +#define PYBIND11_BYTES_SIZE PyBytes_Size #define PYBIND11_LONG_CHECK(o) PyLong_Check(o) #define PYBIND11_LONG_AS_LONGLONG(o) PyLong_AsLongLong(o) #define PYBIND11_LONG_AS_UNSIGNED_LONGLONG(o) PyLong_AsUnsignedLongLong(o) @@ -129,6 +130,7 @@ #define PYBIND11_BYTES_FROM_STRING_AND_SIZE PyString_FromStringAndSize #define PYBIND11_BYTES_AS_STRING_AND_SIZE PyString_AsStringAndSize #define PYBIND11_BYTES_AS_STRING 
PyString_AsString +#define PYBIND11_BYTES_SIZE PyString_Size #define PYBIND11_LONG_CHECK(o) (PyInt_Check(o) || PyLong_Check(o)) #define PYBIND11_LONG_AS_LONGLONG(o) (PyInt_Check(o) ? (long long) PyLong_AsLong(o) : PyLong_AsLongLong(o)) #define PYBIND11_LONG_AS_UNSIGNED_LONGLONG(o) (PyInt_Check(o) ? (unsigned long long) PyLong_AsUnsignedLong(o) : PyLong_AsUnsignedLongLong(o)) diff --git a/tests/test_python_types.cpp b/tests/test_python_types.cpp index e1598e9e..99a3cb9a 100644 --- a/tests/test_python_types.cpp +++ b/tests/test_python_types.cpp @@ -17,6 +17,11 @@ # include #endif +#if defined(_MSC_VER) +# pragma warning(push) +# pragma warning(disable: 4127) // warning C4127: Conditional expression is constant +#endif + class ExamplePythonTypes { public: static ExamplePythonTypes *new_instance() { @@ -426,4 +431,41 @@ test_initializer python_types([](py::module &m) { "l"_a=l ); }); + + // Some test characters in utf16 and utf32 encodings. The last one (the 𝐀) contains a null byte + char32_t a32 = 0x61 /*a*/, z32 = 0x7a /*z*/, ib32 = 0x203d /*β€½*/, cake32 = 0x1f382 /*πŸŽ‚*/, mathbfA32 = 0x1d400 /*𝐀*/; + char16_t b16 = 0x62 /*b*/, z16 = 0x7a, ib16 = 0x203d, cake16_1 = 0xd83c, cake16_2 = 0xdf82, mathbfA16_1 = 0xd835, mathbfA16_2 = 0xdc00; + std::wstring wstr; + wstr.push_back(0x61); // a + wstr.push_back(0x2e18); // ⸘ + if (sizeof(wchar_t) == 2) { wstr.push_back(mathbfA16_1); wstr.push_back(mathbfA16_2); } // 𝐀, utf16 + else { wstr.push_back((wchar_t) mathbfA32); } // 𝐀, utf32 + wstr.push_back(0x7a); // z + + m.def("good_utf8_string", []() { return std::string(u8"Say utf8\u203d \U0001f382 \U0001d400"); }); // Say utf8β€½ πŸŽ‚ 𝐀 + m.def("good_utf16_string", [=]() { return std::u16string({ b16, ib16, cake16_1, cake16_2, mathbfA16_1, mathbfA16_2, z16 }); }); // bβ€½πŸŽ‚π€z + m.def("good_utf32_string", [=]() { return std::u32string({ a32, mathbfA32, cake32, ib32, z32 }); }); // aπ€πŸŽ‚β€½z + m.def("good_wchar_string", [=]() { return wstr; }); // a‽𝐀z + 
m.def("bad_utf8_string", []() { return std::string("abc\xd0" "def"); }); + m.def("bad_utf16_string", [=]() { return std::u16string({ b16, char16_t(0xd800), z16 }); }); + // Under Python 2.7, invalid unicode UTF-32 characters don't appear to trigger UnicodeDecodeError + if (PY_MAJOR_VERSION >= 3) + m.def("bad_utf32_string", [=]() { return std::u32string({ a32, char32_t(0xd800), z32 }); }); + if (PY_MAJOR_VERSION >= 3 || sizeof(wchar_t) == 2) + m.def("bad_wchar_string", [=]() { return std::wstring({ wchar_t(0x61), wchar_t(0xd800) }); }); + m.def("u8_Z", []() -> char { return 'Z'; }); + m.def("u8_eacute", []() -> char { return '\xe9'; }); + m.def("u16_ibang", [=]() -> char16_t { return ib16; }); + m.def("u32_mathbfA", [=]() -> char32_t { return mathbfA32; }); + m.def("wchar_heart", []() -> wchar_t { return 0x2665; }); + + m.attr("wchar_size") = py::cast(sizeof(wchar_t)); + m.def("ord_char", [](char c) -> int { return static_cast(c); }); + m.def("ord_char16", [](char16_t c) -> uint16_t { return c; }); + m.def("ord_char32", [](char32_t c) -> uint32_t { return c; }); + m.def("ord_wchar", [](wchar_t c) -> int { return c; }); }); + +#if defined(_MSC_VER) +# pragma warning(pop) +#endif diff --git a/tests/test_python_types.py b/tests/test_python_types.py index cb28e1ff..c5ade90b 100644 --- a/tests/test_python_types.py +++ b/tests/test_python_types.py @@ -1,3 +1,4 @@ +# Python < 3 needs this: coding=utf-8 import pytest from pybind11_tests import ExamplePythonTypes, ConstructorStats, has_optional, has_exp_optional @@ -410,3 +411,93 @@ def test_implicit_casting(): 'int_i1': 42, 'int_i2': 42, 'int_e': 43, 'int_p': 44 } assert z['l'] == [3, 6, 9, 12, 15] + + +def test_unicode_conversion(): + """Tests unicode conversion and error reporting.""" + import pybind11_tests + from pybind11_tests import (good_utf8_string, bad_utf8_string, + good_utf16_string, bad_utf16_string, + good_utf32_string, # bad_utf32_string, + good_wchar_string, # bad_wchar_string, + u8_Z, u8_eacute, u16_ibang, 
u32_mathbfA, wchar_heart) + + assert good_utf8_string() == u"Say utf8β€½ πŸŽ‚ 𝐀" + assert good_utf16_string() == u"bβ€½πŸŽ‚π€z" + assert good_utf32_string() == u"aπ€πŸŽ‚β€½z" + assert good_wchar_string() == u"aβΈ˜π€z" + + with pytest.raises(UnicodeDecodeError): + bad_utf8_string() + + with pytest.raises(UnicodeDecodeError): + bad_utf16_string() + + # These are provided only if they actually fail (they don't when 32-bit and under Python 2.7) + if hasattr(pybind11_tests, "bad_utf32_string"): + with pytest.raises(UnicodeDecodeError): + pybind11_tests.bad_utf32_string() + if hasattr(pybind11_tests, "bad_wchar_string"): + with pytest.raises(UnicodeDecodeError): + pybind11_tests.bad_wchar_string() + + assert u8_Z() == 'Z' + assert u8_eacute() == u'Γ©' + assert u16_ibang() == u'β€½' + assert u32_mathbfA() == u'𝐀' + assert wchar_heart() == u'β™₯' + + +def test_single_char_arguments(): + """Tests failures for passing invalid inputs to char-accepting functions""" + from pybind11_tests import ord_char, ord_char16, ord_char32, ord_wchar, wchar_size + + def toobig_message(r): + return "Character code point not in range({0:#x})".format(r) + toolong_message = "Expected a character, but multi-character string found" + + assert ord_char(u'a') == 0x61 # simple ASCII + assert ord_char(u'Γ©') == 0xE9 # requires 2 bytes in utf-8, but can be stuffed in a char + with pytest.raises(ValueError) as excinfo: + assert ord_char(u'Δ€') == 0x100 # requires 2 bytes, doesn't fit in a char + assert str(excinfo.value) == toobig_message(0x100) + with pytest.raises(ValueError) as excinfo: + assert ord_char(u'ab') + assert str(excinfo.value) == toolong_message + + assert ord_char16(u'a') == 0x61 + assert ord_char16(u'Γ©') == 0xE9 + assert ord_char16(u'Δ€') == 0x100 + assert ord_char16(u'β€½') == 0x203d + assert ord_char16(u'β™₯') == 0x2665 + with pytest.raises(ValueError) as excinfo: + assert ord_char16(u'πŸŽ‚') == 0x1F382 # requires surrogate pair + assert str(excinfo.value) == 
toobig_message(0x10000) + with pytest.raises(ValueError) as excinfo: + assert ord_char16(u'aa') + assert str(excinfo.value) == toolong_message + + assert ord_char32(u'a') == 0x61 + assert ord_char32(u'Γ©') == 0xE9 + assert ord_char32(u'Δ€') == 0x100 + assert ord_char32(u'β€½') == 0x203d + assert ord_char32(u'β™₯') == 0x2665 + assert ord_char32(u'πŸŽ‚') == 0x1F382 + with pytest.raises(ValueError) as excinfo: + assert ord_char32(u'aa') + assert str(excinfo.value) == toolong_message + + assert ord_wchar(u'a') == 0x61 + assert ord_wchar(u'Γ©') == 0xE9 + assert ord_wchar(u'Δ€') == 0x100 + assert ord_wchar(u'β€½') == 0x203d + assert ord_wchar(u'β™₯') == 0x2665 + if wchar_size == 2: + with pytest.raises(ValueError) as excinfo: + assert ord_wchar(u'πŸŽ‚') == 0x1F382 # requires surrogate pair + assert str(excinfo.value) == toobig_message(0x10000) + else: + assert ord_wchar(u'πŸŽ‚') == 0x1F382 + with pytest.raises(ValueError) as excinfo: + assert ord_wchar(u'aa') + assert str(excinfo.value) == toolong_message