From ee9296395d604732b73a27a78f47c9f6465a83c3 Mon Sep 17 00:00:00 2001 From: Jason Rhinelander Date: Thu, 9 Mar 2017 11:35:28 -0500 Subject: [PATCH] Call PyUnicode_DecodeUTF* directly Some versions of Python 2.7 reportedly (#713) have issues with PyUnicode_Decode being passed the encoding string, so just skip it entirely by calling the PyUnicode_DecodeUTF* function directly. This will also be slightly more efficient by avoiding having to check the encoding string, and (for python 2) going through the unicode class's decode (python 3 fast-tracks this for all utf-{8,16,32} encodings; python 2 only fast-tracked for the exact string "utf-8", which we weren't passing anyway (we had "utf8")). This doesn't work for PyPy, however: its `PyUnicode_DecodeUTF{8,16,32}` appear rather broken: the UTF8 one segfaults, while the 16/32 require recasting into a non-const `char *` (and might segfault; I didn't get far enough to find out). Just avoid the whole thing by keeping the encoding-passed-as-string version for PyPy, which seems to work reliably. --- include/pybind11/cast.h | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/include/pybind11/cast.h b/include/pybind11/cast.h index 30966006..b8638b9c 100644 --- a/include/pybind11/cast.h +++ b/include/pybind11/cast.h @@ -641,7 +641,6 @@ struct type_caster, enable_if_t::value || sizeof(CharT) == 2 || sizeof(CharT) == 4, "Unsupported wchar_t size != 2/4"); static constexpr size_t UTF_N = 8 * sizeof(CharT); - static constexpr const char *encoding = UTF_N == 8 ? "utf8" : UTF_N == 16 ? "utf16" : "utf32"; using StringType = std::basic_string; @@ -666,7 +665,7 @@ struct type_caster, enable_if_t(PyUnicode_AsEncodedString( - load_src.ptr(), encoding, nullptr)); + load_src.ptr(), UTF_N == 8 ? "utf-8" : UTF_N == 16 ? 
"utf-16" : "utf-32", nullptr)); if (!utfNbytes) { PyErr_Clear(); return false; } const CharT *buffer = reinterpret_cast(PYBIND11_BYTES_AS_STRING(utfNbytes.ptr())); @@ -679,12 +678,28 @@ struct type_caster, enable_if_t(src.c_str()); ssize_t nbytes = ssize_t(src.size() * sizeof(CharT)); - handle s = PyUnicode_Decode(buffer, nbytes, encoding, nullptr); + handle s = decode_utfN(buffer, nbytes); if (!s) throw error_already_set(); return s; } PYBIND11_TYPE_CASTER(StringType, _(PYBIND11_STRING_NAME)); + +private: + static handle decode_utfN(const char *buffer, ssize_t nbytes) { +#if !defined(PYPY_VERSION) + return + UTF_N == 8 ? PyUnicode_DecodeUTF8(buffer, nbytes, nullptr) : + UTF_N == 16 ? PyUnicode_DecodeUTF16(buffer, nbytes, nullptr, nullptr) : + PyUnicode_DecodeUTF32(buffer, nbytes, nullptr, nullptr); +#else + // PyPy seems to have multiple problems related to PyUnicode_DecodeUTF*: the UTF8 version + // sometimes segfaults for unknown reasons, while the UTF16 and 32 versions require a + // non-const char * argument, which is also a nuisance, so bypass the whole thing by just + // passing the encoding as a string value, which works properly: + return PyUnicode_Decode(buffer, nbytes, UTF_N == 8 ? "utf-8" : UTF_N == 16 ? "utf-16" : "utf-32", nullptr); +#endif + } }; // Type caster for C-style strings. We basically use a std::string type caster, but also add the