LeOS-Genesis/external/badvpn/misc/unicode_funcs.h

/**
 * @file unicode_funcs.h
 * @author Ambroz Bizjak <ambrop7@gmail.com>
 * 
 * @section LICENSE
 * 
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the author nor the
 *    names of its contributors may be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 * 
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef BADVPN_UNICODE_FUNCS_H
#define BADVPN_UNICODE_FUNCS_H

#include <misc/expstring.h>
#include <misc/bsize.h>
#include <misc/Utf8Encoder.h>
#include <misc/Utf8Decoder.h>
#include <misc/Utf16Encoder.h>
#include <misc/Utf16Decoder.h>

/**
 * Decodes UTF-16 data as bytes into an allocated null-terminated UTF-8 string.
 * 
 * The input is consumed two bytes at a time. A single leftover byte at the
 * end of the input is not decoded and causes an error to be reported.
 * Decoded null characters are left out of the resulting string.
 * 
 * @param data UTF-16 data, in big endian
 * @param data_len size of data in bytes
 * @param out_is_error if not NULL and the function returns a string,
 *                     *out_is_error will be set to 0 or 1, indicating
 *                     whether there have been errors decoding the input.
 *                     The input is considered erroneous when encoding the
 *                     decoded characters back to UTF-16 does not reproduce
 *                     it byte for byte.
 *                     A null decoded character is treated as an error.
 *                     Not written to when the function returns NULL.
 * @return An UTF-8 null-terminated string which can be freed with free(),
 *         or NULL if out of memory.
 */
static char * unicode_decode_utf16_to_utf8 (const uint8_t *data, size_t data_len, int *out_is_error);

/**
 * Decodes UTF-8 data into UTF-16 data as bytes.
 * 
 * Every UTF-16 code unit is written high byte first (big endian). Writing
 * stops as soon as the output buffer is full, which may be in the middle of
 * a code unit; decoding still continues to the end of the input so that the
 * reported length and error status cover all of it.
 * 
 * @param data UTF-8 data
 * @param data_len size of data in bytes
 * @param out output buffer; not written to when out_avail is 0
 * @param out_avail number of bytes available in output buffer
 * @param out_len if not NULL, *out_len will contain the number of bytes
 *                required to store the resulting data (or overflow)
 * @param out_is_error if not NULL, *out_is_error will contain 0 or 1,
 *                     indicating whether there have been errors decoding
 *                     the input. The input is considered erroneous when
 *                     encoding the decoded characters back to UTF-8 does
 *                     not reproduce it byte for byte.
 */
static void unicode_decode_utf8_to_utf16 (const uint8_t *data, size_t data_len, uint8_t *out, size_t out_avail, bsize_t *out_len, int *out_is_error);

static char * unicode_decode_utf16_to_utf8 (const uint8_t *data, size_t data_len, int *out_is_error)
{
    // will build the resulting UTF-8 string by appending to ExpString
    ExpString str;
    if (!ExpString_Init(&str)) {
        goto fail0;
    }
    
    // init UTF-16 decoder
    Utf16Decoder decoder;
    Utf16Decoder_Init(&decoder);
    
    // set initial input and input matching positions
    size_t i_in = 0;
    size_t i_ch = 0;
    
    int error = 0;
    
    while (i_in < data_len) {
        // read two input bytes from the input position
        uint8_t x = data[i_in++];
        if (i_in == data_len) {
            break;
        }
        uint8_t y = data[i_in++];
        
        // combine them into a 16-bit value
        uint16_t xy = (((uint16_t)x << 8) | (uint16_t)y);
        
        // give the 16-bit value to the UTF-16 decoder and maybe
        // receive a Unicode character back
        uint32_t ch;
        if (!Utf16Decoder_Input(&decoder, xy, &ch)) {
            continue;
        }
        
        if (!error) {
            // encode the Unicode character back into UTF-16
            uint16_t chenc[2];
            int chenc_n = Utf16Encoder_EncodeCharacter(ch, chenc);
            ASSERT(chenc_n > 0)
            
            // match the result with input
            for (int chenc_i = 0; chenc_i < chenc_n; chenc_i++) {
                uint8_t cx = (chenc[chenc_i] >> 8);
                uint8_t cy = (chenc[chenc_i] & 0xFF);
                
                if (i_ch >= data_len || data[i_ch] != cx) {
                    error = 1;
                    break;
                }
                i_ch++;
                
                if (i_ch >= data_len || data[i_ch] != cy) {
                    error = 1;
                    break;
                }
                i_ch++;
            }
        }
        
        // we don't like null Unicode characters because we're building a
        // null-terminated UTF-8 string
        if (ch == 0) {
            error = 1;
            continue;
        }
        
        // encode the Unicode character into UTF-8
        uint8_t enc[5];
        int enc_n = Utf8Encoder_EncodeCharacter(ch, enc);
        ASSERT(enc_n > 0)
        
        // append the resulting UTF-8 bytes to the result string
        enc[enc_n] = 0;
        if (!ExpString_Append(&str, enc)) {
            goto fail1;
        }
    }
    
    // check if we matched the whole input string when encoding back
    if (i_ch < data_len) {
        error = 1;
    }
    
    if (out_is_error) {
        *out_is_error = error;
    }
    return ExpString_Get(&str);
    
fail1:
    ExpString_Free(&str);
fail0:
    return NULL;
}

/*
 * Converts UTF-8 bytes to big-endian UTF-16 bytes, writing at most
 * out_avail bytes. Input validity is checked by encoding each decoded
 * character back to UTF-8 and comparing it with the original bytes.
 */
static void unicode_decode_utf8_to_utf16 (const uint8_t *data, size_t data_len, uint8_t *out, size_t out_avail, bsize_t *out_len, int *out_is_error)
{
    Utf8Decoder dec;
    Utf8Decoder_Init(&dec);
    
    // how much of the input has been confirmed by re-encoding
    size_t verify_pos = 0;
    
    // number of output bytes written so far
    size_t out_pos = 0;
    
    // total number of bytes the complete output would occupy
    bsize_t needed = bsize_fromsize(0);
    
    int failed = 0;
    
    for (size_t read_pos = 0; read_pos < data_len; read_pos++) {
        // feed one byte to the decoder; it may not have a complete
        // character yet
        uint32_t ch;
        if (!Utf8Decoder_Input(&dec, data[read_pos], &ch)) {
            continue;
        }
        
        // as long as nothing has gone wrong, verify that this character
        // re-encodes to exactly the input bytes at verify_pos
        if (!failed) {
            uint8_t reenc[4];
            int reenc_n = Utf8Encoder_EncodeCharacter(ch, reenc);
            ASSERT(reenc_n > 0)
            
            for (int k = 0; k < reenc_n; k++) {
                if (verify_pos >= data_len || data[verify_pos] != reenc[k]) {
                    failed = 1;
                    break;
                }
                verify_pos++;
            }
        }
        
        // encode the character as one or two UTF-16 code units
        uint16_t units[2];
        int units_n = Utf16Encoder_EncodeCharacter(ch, units);
        ASSERT(units_n > 0)
        
        // the required size is counted even if the buffer is already full
        needed = bsize_add(needed, bsize_fromsize(2 * units_n));
        
        // store the units high byte first, stopping once the buffer is full
        for (int k = 0; k < units_n; k++) {
            if (out_pos == out_avail) {
                break;
            }
            out[out_pos++] = (units[k] >> 8);
            
            if (out_pos == out_avail) {
                break;
            }
            out[out_pos++] = (units[k] & 0xFF);
        }
    }
    
    // any input that was not confirmed by re-encoding counts as an error
    if (verify_pos < data_len) {
        failed = 1;
    }
    
    if (out_len) {
        *out_len = needed;
    }
    if (out_is_error) {
        *out_is_error = failed;
    }
}

#endif
Bug Fixes 2020-10-05 13:12:00 +02:00			`/**`
			`* @file unicode_funcs.h`
			`* @author Ambroz Bizjak <ambrop7@gmail.com>`
			`*`
			`* @section LICENSE`
			`*`
			`* Redistribution and use in source and binary forms, with or without`
			`* modification, are permitted provided that the following conditions are met:`
			`* 1. Redistributions of source code must retain the above copyright`
			`* notice, this list of conditions and the following disclaimer.`
			`* 2. Redistributions in binary form must reproduce the above copyright`
			`* notice, this list of conditions and the following disclaimer in the`
			`* documentation and/or other materials provided with the distribution.`
			`* 3. Neither the name of the author nor the`
			`* names of its contributors may be used to endorse or promote products`
			`* derived from this software without specific prior written permission.`
			`*`
			`* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND`
			`* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED`
			`* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE`
			`* DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY`
			`* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES`
			`* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;`
			`* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND`
			`* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT`
			`* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS`
			`* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.`
			`*/`

			`#ifndef BADVPN_UNICODE_FUNCS_H`
			`#define BADVPN_UNICODE_FUNCS_H`

			`#include <misc/expstring.h>`
			`#include <misc/bsize.h>`
			`#include <misc/Utf8Encoder.h>`
			`#include <misc/Utf8Decoder.h>`
			`#include <misc/Utf16Encoder.h>`
			`#include <misc/Utf16Decoder.h>`

			`/**`
			`* Decodes UTF-16 data as bytes into an allocated null-terminated UTF-8 string.`
			`*`
			`* @param data UTF-16 data, in big endian`
			`* @param data_len size of data in bytes`
			`* @param out_is_error if not NULL and the function returns a string,`
			`* *out_is_error will be set to 0 or 1, indicating`
			`* whether there have been errors decoding the input.`
			`* A null decoded character is treated as an error.`
			`* @return An UTF-8 null-terminated string which can be freed with free(),`
			`* or NULL if out of memory.`
			`*/`
			`static char * unicode_decode_utf16_to_utf8 (const uint8_t *data, size_t data_len, int *out_is_error);`

			`/**`
			`* Decodes UTF-8 data into UTF-16 data as bytes.`
			`*`
			`* @param data UTF-8 data`
			`* @param data_len size of data in bytes`
			`* @param out output buffer`
			`* @param out_avail number of bytes available in output buffer`
			`* @param out_len if not NULL, *out_len will contain the number of bytes`
			`* required to store the resulting data (or overflow)`
			`* @param out_is_error if not NULL, *out_is_error will contain 0 or 1,`
			`* indicating whether there have been errors decoding`
			`* the input`
			`*/`
			`static void unicode_decode_utf8_to_utf16 (const uint8_t *data, size_t data_len, uint8_t *out, size_t out_avail, bsize_t *out_len, int *out_is_error);`

			`static char * unicode_decode_utf16_to_utf8 (const uint8_t *data, size_t data_len, int *out_is_error)`
			`{`
			`// will build the resulting UTF-8 string by appending to ExpString`
			`ExpString str;`
			`if (!ExpString_Init(&str)) {`
			`goto fail0;`
			`}`

			`// init UTF-16 decoder`
			`Utf16Decoder decoder;`
			`Utf16Decoder_Init(&decoder);`

			`// set initial input and input matching positions`
			`size_t i_in = 0;`
			`size_t i_ch = 0;`

			`int error = 0;`

			`while (i_in < data_len) {`
			`// read two input bytes from the input position`
			`uint8_t x = data[i_in++];`
			`if (i_in == data_len) {`
			`break;`
			`}`
			`uint8_t y = data[i_in++];`

			`// combine them into a 16-bit value`
			`uint16_t xy = (((uint16_t)x << 8) \| (uint16_t)y);`

			`// give the 16-bit value to the UTF-16 decoder and maybe`
			`// receive a Unicode character back`
			`uint32_t ch;`
			`if (!Utf16Decoder_Input(&decoder, xy, &ch)) {`
			`continue;`
			`}`

			`if (!error) {`
			`// encode the Unicode character back into UTF-16`
			`uint16_t chenc[2];`
			`int chenc_n = Utf16Encoder_EncodeCharacter(ch, chenc);`
			`ASSERT(chenc_n > 0)`

			`// match the result with input`
			`for (int chenc_i = 0; chenc_i < chenc_n; chenc_i++) {`
			`uint8_t cx = (chenc[chenc_i] >> 8);`
			`uint8_t cy = (chenc[chenc_i] & 0xFF);`

			`if (i_ch >= data_len \|\| data[i_ch] != cx) {`
			`error = 1;`
			`break;`
			`}`
			`i_ch++;`

			`if (i_ch >= data_len \|\| data[i_ch] != cy) {`
			`error = 1;`
			`break;`
			`}`
			`i_ch++;`
			`}`
			`}`

			`// we don't like null Unicode characters because we're building a`
			`// null-terminated UTF-8 string`
			`if (ch == 0) {`
			`error = 1;`
			`continue;`
			`}`

			`// encode the Unicode character into UTF-8`
			`uint8_t enc[5];`
			`int enc_n = Utf8Encoder_EncodeCharacter(ch, enc);`
			`ASSERT(enc_n > 0)`

			`// append the resulting UTF-8 bytes to the result string`
			`enc[enc_n] = 0;`
			`if (!ExpString_Append(&str, enc)) {`
			`goto fail1;`
			`}`
			`}`

			`// check if we matched the whole input string when encoding back`
			`if (i_ch < data_len) {`
			`error = 1;`
			`}`

			`if (out_is_error) {`
			`*out_is_error = error;`
			`}`
			`return ExpString_Get(&str);`

			`fail1:`
			`ExpString_Free(&str);`
			`fail0:`
			`return NULL;`
			`}`

			`static void unicode_decode_utf8_to_utf16 (const uint8_t *data, size_t data_len, uint8_t *out, size_t out_avail, bsize_t *out_len, int *out_is_error)`
			`{`
			`Utf8Decoder decoder;`
			`Utf8Decoder_Init(&decoder);`

			`size_t i_in = 0;`
			`size_t i_ch = 0;`

			`bsize_t len = bsize_fromsize(0);`

			`int error = 0;`

			`while (i_in < data_len) {`
			`uint8_t x = data[i_in++];`

			`uint32_t ch;`
			`if (!Utf8Decoder_Input(&decoder, x, &ch)) {`
			`continue;`
			`}`

			`if (!error) {`
			`uint8_t chenc[4];`
			`int chenc_n = Utf8Encoder_EncodeCharacter(ch, chenc);`
			`ASSERT(chenc_n > 0)`

			`for (int chenc_i = 0; chenc_i < chenc_n; chenc_i++) {`
			`if (i_ch >= data_len \|\| data[i_ch] != chenc[chenc_i]) {`
			`error = 1;`
			`break;`
			`}`
			`i_ch++;`
			`}`
			`}`

			`uint16_t enc[2];`
			`int enc_n = Utf16Encoder_EncodeCharacter(ch, enc);`
			`ASSERT(enc_n > 0)`

			`len = bsize_add(len, bsize_fromsize(2 * enc_n));`

			`for (int enc_i = 0; enc_i < enc_n; enc_i++) {`
			`if (out_avail == 0) {`
			`break;`
			`}`
			`*(out++) = (enc[enc_i] >> 8);`
			`out_avail--;`

			`if (out_avail == 0) {`
			`break;`
			`}`
			`*(out++) = (enc[enc_i] & 0xFF);`
			`out_avail--;`
			`}`
			`}`

			`if (i_ch < data_len) {`
			`error = 1;`
			`}`

			`if (out_len) {`
			`*out_len = len;`
			`}`
			`if (out_is_error) {`
			`*out_is_error = error;`
			`}`
			`}`

			`#endif`