obs-studio/deps/libcaption/src/utf8.c

/**********************************************************************************************/
/* The MIT License                                                                            */
/*                                                                                            */
/* Copyright 2016-2017 Twitch Interactive, Inc. or its affiliates. All Rights Reserved.       */
/*                                                                                            */
/* Permission is hereby granted, free of charge, to any person obtaining a copy               */
/* of this software and associated documentation files (the "Software"), to deal              */
/* in the Software without restriction, including without limitation the rights               */
/* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell                  */
/* copies of the Software, and to permit persons to whom the Software is                      */
/* furnished to do so, subject to the following conditions:                                   */
/*                                                                                            */
/* The above copyright notice and this permission notice shall be included in                 */
/* all copies or substantial portions of the Software.                                        */
/*                                                                                            */
/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR                 */
/* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,                   */
/* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE                */
/* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER                     */
/* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,              */
/* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN                  */
/* THE SOFTWARE.                                                                              */
/**********************************************************************************************/

#include "utf8.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

const utf8_char_t* utf8_char_next(const utf8_char_t* c)
{
    const utf8_char_t* n = c + utf8_char_length(c);
    return n == c ? 0 : n;
}

// returnes the length of the char in bytes
size_t utf8_char_length(const utf8_char_t* c)
{
    // count null term as zero size
    if (!c || 0x00 == c[0]) {
        return 0;
    }

    static const size_t _utf8_char_length[] = {
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0
    };

    return _utf8_char_length[(c[0] >> 3) & 0x1F];
}

int utf8_char_whitespace(const utf8_char_t* c)
{
    // 0x7F is DEL
    if (!c || (c[0] >= 0 && c[0] <= ' ') || c[0] == 0x7F) {
        return 1;
    }

    // EIA608_CHAR_NO_BREAK_SPACE TODO other utf8 spaces
    if (0xC2 == (unsigned char)c[0] && 0xA0 == (unsigned char)c[1]) {
        return 1;
    }

    return 0;
}

// returns length of the string in bytes
// size is number of charcter to count (0 to count until NULL term)
size_t utf8_string_length(const utf8_char_t* data, utf8_size_t size)
{
    size_t char_length, byts = 0;

    if (0 == size) {
        size = utf8_char_count(data, 0);
    }

    for (; 0 < size; --size) {
        if (0 == (char_length = utf8_char_length(data))) {
            break;
        }

        data += char_length;
        byts += char_length;
    }

    return byts;
}

size_t utf8_char_copy(utf8_char_t* dst, const utf8_char_t* src)
{
    size_t bytes = utf8_char_length(src);

    if (bytes && dst) {
        memcpy(dst, src, bytes);
        dst[bytes] = '\0';
    }

    return bytes;
}

// returnes the number of utf8 charcters in a string given the number of bytes
// to count until the a null terminator, pass 0 for size
utf8_size_t utf8_char_count(const char* data, size_t size)
{
    size_t i, bytes = 0;
    utf8_size_t count = 0;

    if (0 == size) {
        size = strlen(data);
    }

    for (i = 0; i < size; ++count, i += bytes) {
        if (0 == (bytes = utf8_char_length(&data[i]))) {
            break;
        }
    }

    return count;
}

// returnes the length of the line in bytes triming not printable charcters at the end
size_t utf8_trimmed_length(const utf8_char_t* data, utf8_size_t charcters)
{
    size_t l, t = 0, split_at = 0;
    for (size_t c = 0; (*data) && c < charcters; ++c) {
        l = utf8_char_length(data);
        t += l, data += l;
        if (!utf8_char_whitespace(data)) {
            split_at = t;
        }
    }

    return split_at;
}

size_t _utf8_newline(const utf8_char_t* data)
{
    if ('\r' == data[0]) {
        return '\n' == data[1] ? 2 : 1; // windows/unix
    } else if ('\n' == data[0]) {
        return '\r' == data[1] ? 2 : 1; // riscos/macos
    } else {
        return 0;
    }
}
// returns the length in bytes of the line including the new line charcter(s)
// auto detects between windows(CRLF), unix(LF), mac(CR) and riscos (LFCR) line endings
size_t utf8_line_length(const utf8_char_t* data)
{
    size_t n, len = 0;

    for (len = 0; 0 != data[len]; ++len) {
        if (0 < (n = _utf8_newline(data))) {
            return len + n;
        }
    }

    return len;
}

// returns number of chars to include before split
utf8_size_t utf8_wrap_length(const utf8_char_t* data, utf8_size_t size)
{
    // Set split_at to size, so if a split point cna not be found, retuns the size passed in
    size_t char_length, char_count, split_at = size;

    for (char_count = 0; char_count <= size; ++char_count) {
        if (_utf8_newline(data)) {
            return char_count;
        } else if (utf8_char_whitespace(data)) {
            split_at = char_count;
        }

        char_length = utf8_char_length(data);
        data += char_length;
    }

    return split_at;
}

int utf8_line_count(const utf8_char_t* data)
{
    size_t len = 0;
    int count = 0;

    do {
        len = utf8_line_length(data);
        data += len;
        ++count;
    } while (0 < len);

    return count - 1;
}

utf8_char_t* utf8_load_text_file(const char* path, size_t* size)
{
    utf8_char_t* data = NULL;
    FILE* file = fopen(path, "r");

    if (file) {
        fseek(file, 0, SEEK_END);
        size_t file_size = ftell(file);
        fseek(file, 0, SEEK_SET);

        if (0 == (*size) || file_size <= (*size)) {
            (*size) = 0;
            data = (utf8_char_t*)malloc(1 + file_size);
            memset(data, '\0', file_size);

            if (data) {
                utf8_char_t* pos = data;
                size_t bytes_read = 0;

                while (0 < (bytes_read = fread(pos, 1, file_size - (*size), file))) {
                    pos += bytes_read;
                    (*size) += bytes_read;
                }
            }

            fclose(file);
        }
    }

    data[*size] = 0;
    return data;
}

#ifndef strnstr
char* strnstr(const char* string1, const char* string2, size_t len)
{
    size_t length2;

    length2 = strlen(string2);
    if (!length2) {
        return (char*)string1;
    }

    while (len >= length2) {
        len--;
        if (!memcmp(string1, string2, length2))
            return (char*)string1;
        string1++;
    }
    return NULL;
}
#endif