From 02a44bcfe35f868cbbf3cf44d2d26146391dcd88 Mon Sep 17 00:00:00 2001 From: Tenari Date: Sat, 23 May 2026 14:06:39 -0500 Subject: [PATCH] more string and utf8 codepoint logic --- all.h | 29 +++++++- string.c | 209 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 231 insertions(+), 7 deletions(-) diff --git a/all.h b/all.h index 7916e07..03c3dca 100644 --- a/all.h +++ b/all.h @@ -338,6 +338,22 @@ typedef enum Utf8Character { Utf8Character_Count, } Utf8Character; +typedef struct Codepoint { + Utf8Character type; + u8 size; + u32 code; +} Codepoint; + +typedef struct TextPos { + i64 line; + i64 column; +} TextPos; + +typedef struct TextRange { + TextPos min; + TextPos max; +} TextRange; + typedef enum FieldType { FieldTypeU8, FieldTypeU16, @@ -576,6 +592,8 @@ typedef struct ThreadContext { ///// HARDCODED GLOBALS global const u64 MAX_u64 = 0xffffffffffffffffull; +global const i64 MAX_i64 = 9223372036854775807LL; +global const i64 MIN_i64 = (-9223372036854775807LL - 1); global const u32 MAX_u32 = 0xffffffff; global const u16 MAX_u16 = 0xffff; global const u8 MAX_u8 = 0xff; @@ -641,7 +659,8 @@ void scratchReturn(ScratchMem* scratch); fn bool stringsEq(String* a, String* b); fn bool cStringEqString(str a, String* b); -fn Utf8Character classifyUtf8Character(u8 c); +#define classifyUtf8Character utf8CharacterClassify +fn Utf8Character utf8CharacterClassify(u8 c); fn bool isUtf8Ascii(u8 c); fn bool isUtf8TwoByte(u8 c); fn bool isUtf8ThreeByte(u8 c); @@ -651,6 +670,14 @@ fn u8 upperAscii(u8 c); fn StringUTF16Const str16FromStr8(Arena* a, String string); fn bool isAlphaUnderscoreSpace(u8 c); fn bool isSimplePrintable(u8 c); +fn bool codepointIsWordBreak(Codepoint c); +fn bool codepointIsWhitespace(Codepoint c); +fn Codepoint codepointFromBytes(ptr bytes, u32 offset); +fn Codepoint codepointFromBytesBefore(ptr bytes, u32 offset); +fn Codepoint codepointFromRawInt(u32 c); +fn String stringFromRawCodepoint(Arena* a, u32 c); +fn bool stringInsertCodepointAtByte(String* s, Codepoint c, u32 byte_offset); +fn bool stringDeleteCodepointAtByte(String* s, u32 byte_offset); ///// OS-wrapped apis void osInit(); diff --git a/string.c b/string.c index a724458..7db0af0 100644 --- a/string.c +++ b/string.c @@ -24,7 +24,7 @@ fn bool cStringEqString(str a, String* b) { return true; } -fn Utf8Character classifyUtf8Character(u8 c) { +fn Utf8Character utf8CharacterClassify(u8 c) { /*two_byte utf8 starts with 1100. 192 three_byte utf8 starts with 1110. 224 four_byte utf8 starts with 1111. 240*/ @@ -42,6 +42,23 @@ fn Utf8Character classifyUtf8Character(u8 c) { } } +fn Utf8Character utf8CharacterClassifyUnsafe(u8 c) { + /*two_byte utf8 starts with 1100. 192 + three_byte utf8 starts with 1110. 224 + four_byte utf8 starts with 1111. 240*/ + if (c <= 127) { + return Utf8CharacterAscii; + } else if (c >= 192 && c < 224) { + return Utf8CharacterTwoByte; + } else if (c >= 224 && c < 240) { + return Utf8CharacterThreeByte; + } else if (c >= 240) { + return Utf8CharacterFourByte; + } else { + return Utf8Character_Count; + } +} + fn bool isUtf8Ascii(u8 c) { return classifyUtf8Character(c) == Utf8CharacterAscii; } @@ -79,6 +96,185 @@ fn bool isAlphaUnderscoreSpace(u8 c) { || c == '_'); } +fn bool codepointIsWordBreak(Codepoint c) { + return codepointIsWhitespace(c) || + c.code == ';' || c.code == ':' || + c.code == '.' || c.code == ',' || c.code == '!' || c.code == '?' || + c.code == '|' || c.code == '/' || c.code == '\\' + ; +} + +fn bool codepointIsWhitespace(Codepoint c) { + return c.code == ' ' || c.code == '\t' || c.code == '\n'; +} + +fn Codepoint codepointFromBytes(ptr bytes, u32 offset) { + Codepoint result = {0}; + result.type = utf8CharacterClassify(bytes[offset]); + switch (result.type) { + case Utf8CharacterAscii: { + result.size = 1; + result.code = bytes[offset]; + } break; + case Utf8CharacterTwoByte: { + result.size = 2; + result.code = ( + bytes[offset] << 8 | bytes[offset+1] + ); + } break; + case Utf8CharacterThreeByte: { + result.size = 3; + result.code = ( + bytes[offset] << 16 | bytes[offset+1] << 8 | bytes[offset+2] + ); + } break; + case Utf8CharacterFourByte: { + result.size = 4; + result.code = ( + bytes[offset] << 24 | bytes[offset+1] << 16 | bytes[offset+2] << 8 | bytes[offset+3] + ); + } break; + case Utf8Character_Count: { + printf("unabled to classify utf8 codepoint %d\n", bytes[offset]); + assert(false); + } break; + } + return result; +} + +fn Codepoint codepointFromBytesBefore(ptr bytes, u32 offset) { + assert(offset > 0); + Codepoint result = { .type = Utf8Character_Count }; + u32 i = 0; + while (result.type == Utf8Character_Count && i <= 4) { + i++; + offset -= 1; + result.type = utf8CharacterClassifyUnsafe(bytes[offset]); + } + switch (result.type) { + case Utf8CharacterAscii: { + result.size = 1; + result.code = bytes[offset]; + } break; + case Utf8CharacterTwoByte: { + result.size = 2; + result.code = ( + bytes[offset] << 8 | bytes[offset+1] + ); + } break; + case Utf8CharacterThreeByte: { + result.size = 3; + result.code = ( + bytes[offset] << 16 | bytes[offset+1] << 8 | bytes[offset+2] + ); + } break; + case Utf8CharacterFourByte: { + result.size = 4; + result.code = ( + bytes[offset] << 24 | bytes[offset+1] << 16 | bytes[offset+2] << 8 | bytes[offset+3] + ); + } break; + case Utf8Character_Count: { + printf("unabled to classify utf8 codepoint %d\n", bytes[offset]); + assert(false); + } break; + } + return result; +} + +fn Codepoint codepointFromRawInt(u32 c) { + Codepoint result = { .code = c }; + if (c <= 127) { // ascii + result.type = Utf8CharacterAscii; + result.size = 1; + } else if ((c >> 8) >= 192 && (c >> 8) < 224) { + result.type = Utf8CharacterTwoByte; + result.size = 2; + } else if ((c >> 16) >= 224 && (c >> 16) < 240) { + result.type = Utf8CharacterThreeByte; + result.size = 3; + } else if ((c >> 24) >= 240) { + result.type = Utf8CharacterFourByte; + result.size = 4; + } + return result; +} + +fn void codepointFillBuf(Codepoint cp, ptr buf) { + switch (cp.type) { + case Utf8CharacterAscii: { + buf[0] = cp.code; + } break; + case Utf8CharacterTwoByte: { + buf[0] = (cp.code & 0xFF00) >> 8; + buf[1] = cp.code & 0xFF; + } break; + case Utf8CharacterThreeByte: { + buf[0] = (cp.code & 0xFF0000) >> 16; + buf[1] = (cp.code & 0xFF00) >> 8; + buf[2] = cp.code & 0xFF; + } break; + case Utf8CharacterFourByte: { + buf[0] = (cp.code & 0xFF000000) >> 24; + buf[1] = (cp.code & 0xFF0000) >> 16; + buf[2] = (cp.code & 0xFF00) >> 8; + buf[3] = cp.code & 0xFF; + } break; + case Utf8Character_Count: { + assert(false); + } break; + } +} + +fn String stringFromRawCodepoint(Arena* a, u32 c) { + String result = {0}; + Codepoint codepoint = codepointFromRawInt(c); + switch (codepoint.type) { + case Utf8CharacterAscii: result.capacity = 1; break; + case Utf8CharacterTwoByte: result.capacity = 2; break; + case Utf8CharacterThreeByte: result.capacity = 3; break; + case Utf8CharacterFourByte: result.capacity = 4; break; + case Utf8Character_Count: { + assert(false); + } break; + } + result.length = result.capacity; + result.bytes = arenaAlloc(a, result.length); + codepointFillBuf(codepoint, result.bytes); + return result; +} + +fn bool stringInsertCodepointAtByte(String* s, Codepoint c, u32 byte_offset) { + u32 remaining_space = s->capacity - s->length; + if (remaining_space < c.size) return false; + + char codepoint_bytes[4]; + codepointFillBuf(c, codepoint_bytes); + // shift all the bytes over from the byte_offset onward + for (u32 i = 0; i < c.size; i++) { + for (u32 ii = s->length; ii > byte_offset+i; ii--) { + s->bytes[ii] = s->bytes[ii-1]; + } + s->length += 1; + s->bytes[byte_offset+i] = codepoint_bytes[i]; + } + return true; +} + +fn bool stringDeleteCodepointAtByte(String* s, u32 byte_offset) { + if (s->length < byte_offset) return false; + + Codepoint cp = codepointFromBytes(s->bytes, byte_offset); + // shift all the bytes from the back towards the byte_offset + for (u32 i = 0; i < cp.size; i++) { + for (u32 ii = byte_offset+i; ii < s->length; ii++) { + s->bytes[ii] = s->bytes[ii+1]; + } + s->length -= 1; + } + return true; +} + fn bool isSimplePrintable(u8 c) { return (c >= ' ' && c <= '~'); } @@ -88,7 +284,7 @@ typedef struct StrDecode { u32 size; } StrDecode; -fn StrDecode strDecodeUTF8(u8 *str, u32 cap){ +fn StrDecode strDecodeUTF8(u8 *string, u32 cap){ u8 length[] = { 1, 1, 1, 1, // 000xx 1, 1, 1, 1, @@ -109,14 +305,14 @@ fn StrDecode strDecodeUTF8(u8 *str, u32 cap){ result.codepoint = '#'; result.size = 1; - u8 byte = str[0]; + u8 byte = string[0]; u8 l = length[byte >> 3]; if (0 < l && l <= cap){ u32 cp = (byte & first_byte_mask[l]) << 18; switch (l){ - case 4: cp |= ((str[3] & 0x3F) << 0); - case 3: cp |= ((str[2] & 0x3F) << 6); - case 2: cp |= ((str[1] & 0x3F) << 12); + case 4: cp |= ((string[3] & 0x3F) << 0); + case 3: cp |= ((string[2] & 0x3F) << 6); + case 2: cp |= ((string[1] & 0x3F) << 12); default: break; } cp >>= final_shift[l]; @@ -214,3 +410,4 @@ fn StringUTF16Const str16FromStr8(Arena* arena, String string) { StringUTF16Const result = { memory, string_count }; return result; } +