/* unicode-utils.h * Unicode utility definitions * * Wireshark - Network traffic analyzer * By Gerald Combs * Copyright 2006 Gerald Combs * * SPDX-License-Identifier: GPL-2.0-or-later */ #ifndef __UNICODEUTIL_H__ #define __UNICODEUTIL_H__ #include #ifdef _WIN32 #include #include #include #endif /** * @file * Unicode convenience routines. */ #ifdef __cplusplus extern "C" { #endif #ifdef WS_DEBUG_UTF_8 #define DEBUG_UTF_8_ENABLED true #else #define DEBUG_UTF_8_ENABLED false #endif #define _CHECK_UTF_8(level, str, len) \ do { \ const char *__uni_endptr; \ if (DEBUG_UTF_8_ENABLED && (str) != NULL && \ !g_utf8_validate(str, len, &__uni_endptr)) { \ ws_log_utf8(str, len, __uni_endptr); \ } \ } while (0) #define WS_UTF_8_CHECK(str, len) \ _CHECK_UTF_8(LOG_LEVEL_DEBUG, str, len) #define WS_UTF_8_DEBUG_HERE(str, len) \ _CHECK_UTF_8(LOG_LEVEL_ECHO, str, len) WSUTIL_EXPORT const int ws_utf8_seqlen[256]; /** * @brief Returns the length of a UTF-8 multibyte sequence from its first byte. * * Determines the expected number of bytes in a UTF-8 encoded code point * based on the leading byte. Returns 0 if the byte is invalid as a UTF-8 starter. * * @param ch The first byte of a UTF-8 sequence. * @return Length of the UTF-8 sequence (1–4), or 0 if invalid. */ #define ws_utf8_char_len(ch) (ws_utf8_seqlen[(ch)]) /** * @brief Validates and sanitizes a UTF-8 byte sequence. * * Processes a raw byte string of length `length`, replacing any ill-formed * UTF-8 sequences with the Unicode REPLACEMENT CHARACTER (U+FFFD). * The result is allocated using the provided `wmem` scope. * * @param scope Memory allocator scope for the returned string. * @param ptr Pointer to the input byte sequence. * @param length Length of the input sequence. * @return Pointer to a valid UTF-8 string, allocated via `scope`. */ WS_DLL_PUBLIC uint8_t * ws_utf8_make_valid(wmem_allocator_t *scope, const uint8_t *ptr, ssize_t length); /** * @brief Validates a UTF-8 byte sequence and returns a string buffer. * * Similar to `ws_utf8_make_valid()`, but returns a `wmem_strbuf_t` object * for easier manipulation and appending. Ill-formed sequences are replaced * with the Unicode REPLACEMENT CHARACTER. * * @param scope Memory allocator scope for the returned buffer. * @param ptr Pointer to the input byte sequence. * @param length Length of the input sequence. * @return Pointer to a valid UTF-8 string buffer. */ WS_DLL_PUBLIC wmem_strbuf_t * ws_utf8_make_valid_strbuf(wmem_allocator_t *scope, const uint8_t *ptr, ssize_t length); #ifdef _WIN32 /** * @brief Given a UTF-8 string, convert it to UTF-16. This is meant to be used * to convert between GTK+ 2.x (UTF-8) to Windows (UTF-16). * * @param utf8str The string to convert. May be NULL. * @return The string converted to UTF-16. If utf8str is NULL, returns * NULL. The return value should NOT be freed by the caller. */ WS_DLL_PUBLIC const wchar_t * utf_8to16(const char *utf8str); /** * @brief Create a UTF-16 string (in place) according to the format string. * * @param utf16buf The buffer to return the UTF-16 string in. * @param utf16buf_len The size of the 'utf16buf' parameter * @param fmt A standard printf() format string */ WS_DLL_PUBLIC void utf_8to16_snprintf(TCHAR *utf16buf, int utf16buf_len, const char* fmt, ...) G_GNUC_PRINTF(3, 4); /** * @brief Given a UTF-16 string, convert it to UTF-8. This is meant to be used * to convert between GTK+ 2.x (UTF-8) to Windows (UTF-16). * * @param utf16str The string to convert. May be NULL. * @return The string converted to UTF-8. If utf16str is NULL, returns * NULL. The return value should NOT be freed by the caller. */ WS_DLL_PUBLIC char * utf_16to8(const wchar_t *utf16str); /** * @brief Converts a UTF-16 argument list to UTF-8. * * Converts a program's command-line arguments from UTF-16 (typically used on Windows) * to UTF-8 encoding. This is useful for normalizing input at startup to ensure consistent * string handling across platforms and libraries. * * The returned array is allocated using standard memory allocation routines and must be * freed by the caller. Each string in the array is individually allocated. * * @param argc The number of arguments. * @param wc_argv Array of UTF-16 encoded argument strings. * @return Pointer to an array of UTF-8 encoded strings, or NULL on failure. */ WS_DLL_PUBLIC char **arg_list_utf_16to8(int argc, wchar_t *wc_argv[]); #endif /* _WIN32 */ /* * defines for helping with UTF-16 surrogate pairs */ #define IS_LEAD_SURROGATE(uchar2) \ ((uchar2) >= 0xd800 && (uchar2) < 0xdc00) #define IS_TRAIL_SURROGATE(uchar2) \ ((uchar2) >= 0xdc00 && (uchar2) < 0xe000) #define SURROGATE_VALUE(lead, trail) \ (((((lead) - 0xd800) << 10) | ((trail) - 0xdc00)) + 0x10000) #ifdef __cplusplus } #endif #endif /* __UNICODEUTIL_H__ */