Idris-dev/rts/idris_utf8.c
2020-01-21 08:56:34 +01:00

181 lines
4.2 KiB
C

#include "idris_utf8.h"
#include <string.h>
#include <stdlib.h>
// Count the characters in the NUL-terminated UTF-8 string s.
//
// Every byte that is not a continuation byte (bit pattern 10xxxxxx) is
// counted as the start of one character; continuation bytes are skipped.
// No validation is performed, so stray continuation bytes are simply not
// counted.
int idris_utf8_strlen(char *s) {
    int count = 0;
    for (char *p = s; *p != '\0'; ++p) {
        if ((*p & 0xc0) == 0x80) {
            continue; // continuation byte: belongs to the previous character
        }
        count++;
    }
    return count;
}
// Return the length in bytes of the character that starts at s[0].
//
//  - A byte with the top bit clear (this includes the NUL terminator) is a
//    one-byte character.
//  - Lead bytes that can never appear in UTF-8 (0xC0, 0xC1 and 0xF5..0xFF)
//    are treated as a one-byte "invalid" character.
//  - Otherwise the length is 1 plus the number of continuation bytes
//    (10xxxxxx) that immediately follow.
int idris_utf8_charlen(char* s) {
    // Read the lead byte as unsigned: plain char may be signed, in which
    // case bytes >= 0x80 would be negative and the range checks below
    // could never match.
    unsigned char lead = (unsigned char)s[0];

    if ((lead & 0x80) == 0) {
        return 1; // Top bit unset, so 1 byte
    }
    if (lead > 244 || lead == 192 || lead == 193) {
        return 1; // Invalid characters
    }

    int i = 1;
    while ((s[i] & 0xc0) == 0x80) {
        i++; // Move on until top 2 bits are not 10
    }
    return i;
}
// Decode and return the code point of the idx-th character (0-based) of the
// NUL-terminated UTF-8 string s.
//
//  - If idx is at or beyond the end of the string, the scan stops at the
//    terminator and 0 is returned.
//  - Lead bytes that can never appear in UTF-8 (0xC0, 0xC1, 0xF5..0xFF) are
//    returned as their raw byte value.
//  - A multi-byte sequence that is cut short (fewer continuation bytes than
//    the lead byte announces) is decoded from the bytes that are present;
//    decoding never reads past a non-continuation byte.
unsigned idris_utf8_index(char* s, int idx) {
    int i = 0, j = 0;

    // Step over idx character starts, but never past the terminator.
    while (j < idx && s[i] != '\0') {
        if ((s[i] & 0xc0) != 0x80) j++;
        i++;
    }
    // Find the start of the next character
    while ((s[i] & 0xc0) == 0x80) { i++; }

    // s[i] is now the start of the character we want. Inspect it as an
    // unsigned byte so the range checks work when plain char is signed.
    unsigned char lead = (unsigned char)s[i];
    unsigned bytes = 0;
    unsigned top = 0;

    if ((lead & 0x80) == 0) {
        bytes = 1;
        top = lead;
    } else if (lead > 244 || lead == 192 || lead == 193) {
        bytes = 1;
        top = lead; // Invalid characters
    } else if ((lead & 0xe0) == 0xc0) {
        bytes = 2;
        top = lead & 0x1f; // 5 bits
    } else if ((lead & 0xf0) == 0xe0) {
        bytes = 3;
        top = lead & 0x0f; // 4 bits
    } else if ((lead & 0xf8) == 0xf0) {
        bytes = 4;
        top = lead & 0x07; // 3 bits
    }
    // Lead bytes 0xF8..0xFD (old 5/6-byte forms) are > 244 and therefore
    // already handled by the invalid branch above.

    // Fold in the continuation bytes, 6 payload bits each. Stop early if the
    // next byte is not a continuation byte (this includes the terminator).
    while (bytes > 1 && (s[i + 1] & 0xc0) == 0x80) {
        i++;
        top = (top << 6) | (unsigned)(s[i] & 0x3f); // 6 bits
        --bytes;
    }
    return top;
}
// Return a pointer i characters further into the UTF-8 string str, or to
// its NUL terminator if the string has fewer than i characters left.
char* idris_utf8_advance(char* str, int i) {
    char *cursor = str;
    int remaining = i;

    // Any byte that does not look like 10xxxxxx starts a character (either
    // a single-byte character, whose top bit is 0, or the lead byte of a
    // multi-byte one, whose top bits are 11). Each such byte uses up one of
    // the characters we were asked to skip. Over-long runs of continuation
    // bytes are not rejected, but the scan always stops at the terminator.
    for (; remaining > 0 && *cursor != '\0'; ++cursor) {
        if ((*cursor & 0xc0) != 0x80) {
            remaining--;
        }
    }

    // cursor sits just after the lead byte of the last skipped character;
    // step over its continuation bytes. This cannot run off the end because
    // the terminator is not a continuation byte.
    while ((*cursor & 0xc0) == 0x80) {
        cursor++;
    }
    return cursor;
}
// Return the byte offset of the i-th character (0-based) of the UTF-8
// string str, i.e. the total byte length of its first i characters as
// measured by idris_utf8_charlen.
//
// If the string holds fewer than i characters, the offset of the NUL
// terminator (the string's byte length) is returned.
int idris_utf8_findOffset(char* str, int i) {
    int offset = 0;
    // idris_utf8_charlen reports a length of 1 for the terminator, so the
    // loop must stop there itself or it would keep walking past the end of
    // the string.
    while (i > 0 && *str != '\0') {
        int len = idris_utf8_charlen(str);
        str += len;
        offset += len;
        i--;
    }
    return offset;
}
// Encode the code point x as a freshly malloc'd, NUL-terminated UTF-8
// string. The caller owns the result and must free() it.
//
//  - x < 0x80 (this includes negative values) is stored as a single byte,
//    (char)x.
//  - 0x80..0x7ff, 0x800..0xffff and 0x10000..0x10ffff are encoded as 2, 3
//    and 4 bytes respectively.
//  - x > 0x10ffff yields an empty string.
//
// Returns NULL if the allocation fails.
char* idris_utf8_fromChar(int x) {
    char* str;
    int bytes = 0, top = 0;

    if (x < 0x80) {
        str = malloc(2);
        if (str == NULL) {
            return NULL;
        }
        str[0] = (char)x;
        str[1] = '\0';
        return str;
    }

    if (x <= 0x7ff) {
        bytes = 2;
        top = 0xc0;
    } else if (x <= 0xffff) {
        bytes = 3;
        top = 0xe0;
    } else if (x <= 0x10ffff) {
        bytes = 4;
        top = 0xf0;
    }

    str = malloc((size_t)bytes + 1);
    if (str == NULL) {
        return NULL;
    }
    str[bytes] = '\0';

    // Fill from the last byte backwards: each trailing byte carries the
    // next 6 low bits tagged with 10xxxxxx, and the lead byte carries
    // whatever bits remain together with the length marker in top.
    while (bytes > 0) {
        int xbits = x & 0x3f; // Next 6 bits
        bytes--;
        str[bytes] = (char)(xbits + (bytes > 0 ? 0x80 : top));
        x = x >> 6;
    }
    return str;
}
// Reverse, in place, the bytes from start to end inclusive.
// Does nothing when start is not below end.
void reverse_range(char *start, char *end)
{
    char *lo = start;
    char *hi = end;
    for (; lo < hi; ++lo, --hi) {
        char tmp = *hi;
        *hi = *lo;
        *lo = tmp;
    }
}
// Reverse, in place, the bytes of the single UTF-8 character that begins at
// start (the byte at start plus every continuation byte, 10xxxxxx, directly
// after it). Returns a pointer to the byte following that character.
char* reverse_char(char *start)
{
    char *last = start;
    // Extend last while the byte after it is a continuation byte.
    for (; (*(last + 1) & 0xc0) == 0x80; ++last) {
    }
    reverse_range(start, last);
    return last + 1;
}
// Write the character-wise reversal of the UTF-8 string s into result and
// return result.
//
// s is copied into result with strcpy, so result must have room for the
// whole of s including its terminator.
//
// The reversal is done in two passes: first the bytes of every individual
// character are reversed, then the whole buffer is reversed. The second
// pass puts each character's bytes back in their original order while
// reversing the order of the characters themselves.
char* idris_utf8_rev(char* s, char* result) {
    strcpy(result, s);
    char* end = result;
    while (*end) { end = reverse_char(end); }
    // For an empty string end == result; skip the final pass rather than
    // forming the pointer result - 1, which lies before the buffer.
    if (end > result) {
        reverse_range(result, end - 1);
    }
    return result;
}