cbase/string.c

#include "all.h"

// Reports whether two Strings hold exactly the same bytes.
// Strings with different lengths are never equal; otherwise the first
// `length` bytes of each are compared one at a time.
fn bool stringsEq(String* a, String* b) {
  bool same = (a->length == b->length);
  u32 index = 0;
  // Stops at the first mismatch, or immediately when the lengths differ.
  while (same && index < a->length) {
    same = (a->bytes[index] == b->bytes[index]);
    index++;
  }
  return same;
}

// Reports whether the NUL-terminated C string `a` has the same length and
// the same bytes as the String `b`.
fn bool cStringEqString(str a, String* b) {
  if (strlen(a) != b->length) return false;

  // Count how many leading bytes agree; equal only if all of them do.
  u32 matched = 0;
  while (matched < b->length && a[matched] == b->bytes[matched]) {
    matched++;
  }
  return matched == b->length;
}

// Classifies `c` as the first byte of a UTF-8 sequence.
//
//   0xxxxxxx (0x00..0x7F) -> Utf8CharacterAscii
//   110xxxxx (0xC0..0xDF) -> Utf8CharacterTwoByte
//   1110xxxx (0xE0..0xEF) -> Utf8CharacterThreeByte
//   11110xxx (0xF0..0xF7) -> Utf8CharacterFourByte
//
// Anything else is not a valid starting byte: continuation bytes
// (10xxxxxx) and 11111xxx (0xF8..0xFF), which the length table in
// strDecodeUTF8 also treats as invalid. Those trip the assert; when asserts
// are compiled out the function returns Utf8Character_Count.
fn Utf8Character utf8CharacterClassify(u8 c) {
  if (c < 0x80) {
    return Utf8CharacterAscii;
  }
  if (c >= 0xC0 && c < 0xE0) {
    return Utf8CharacterTwoByte;
  }
  if (c >= 0xE0 && c < 0xF0) {
    return Utf8CharacterThreeByte;
  }
  if (c >= 0xF0 && c < 0xF8) {
    return Utf8CharacterFourByte;
  }
  assert(false && "Not a valid utf8 starting byte");
  return Utf8Character_Count;
}

// Classifies `c` as the first byte of a UTF-8 sequence without asserting.
//
//   0xxxxxxx (0x00..0x7F) -> Utf8CharacterAscii
//   110xxxxx (0xC0..0xDF) -> Utf8CharacterTwoByte
//   1110xxxx (0xE0..0xEF) -> Utf8CharacterThreeByte
//   11110xxx (0xF0..0xF7) -> Utf8CharacterFourByte
//
// Returns Utf8Character_Count for every other byte: continuation bytes
// (10xxxxxx) and 11111xxx (0xF8..0xFF), which the length table in
// strDecodeUTF8 also treats as invalid. Callers can use that value to
// detect "not a starting byte", e.g. while scanning backwards.
fn Utf8Character utf8CharacterClassifyUnsafe(u8 c) {
  if (c < 0x80) {
    return Utf8CharacterAscii;
  }
  if (c >= 0xC0 && c < 0xE0) {
    return Utf8CharacterTwoByte;
  }
  if (c >= 0xE0 && c < 0xF0) {
    return Utf8CharacterThreeByte;
  }
  if (c >= 0xF0 && c < 0xF8) {
    return Utf8CharacterFourByte;
  }
  return Utf8Character_Count;
}

// True when `c` is a single-byte (ASCII) UTF-8 character.
// Uses utf8CharacterClassify, the classifier defined in this file, so a
// byte that is not a valid starting byte trips that function's assert.
fn bool isUtf8Ascii(u8 c) {
  return utf8CharacterClassify(c) == Utf8CharacterAscii;
}

// True when `c` is the first byte of a two-byte UTF-8 sequence.
// Uses utf8CharacterClassify, the classifier defined in this file, so a
// byte that is not a valid starting byte trips that function's assert.
fn bool isUtf8TwoByte(u8 c) {
  return utf8CharacterClassify(c) == Utf8CharacterTwoByte;
}

// True when `c` is the first byte of a three-byte UTF-8 sequence.
// Uses utf8CharacterClassify, the classifier defined in this file, so a
// byte that is not a valid starting byte trips that function's assert.
fn bool isUtf8ThreeByte(u8 c) {
  return utf8CharacterClassify(c) == Utf8CharacterThreeByte;
}

// True when `c` is the first byte of a four-byte UTF-8 sequence.
// Uses utf8CharacterClassify, the classifier defined in this file, so a
// byte that is not a valid starting byte trips that function's assert.
fn bool isUtf8FourByte(u8 c) {
  return utf8CharacterClassify(c) == Utf8CharacterFourByte;
}

// Maps ASCII 'A'..'Z' to 'a'..'z'; every other byte is returned unchanged.
fn u8 lowerAscii(u8 c) {
  bool is_upper = ('A' <= c && c <= 'Z');
  return is_upper ? (u8)(c + ('a' - 'A')) : c;
}

// Maps ASCII 'a'..'z' to 'A'..'Z'; every other byte is returned unchanged.
fn u8 upperAscii(u8 c) {
  bool is_lower = ('a' <= c && c <= 'z');
  return is_lower ? (u8)(c - ('a' - 'A')) : c;
}

// True for ASCII letters, the space character and the underscore.
fn bool isAlphaUnderscoreSpace(u8 c) {
  if (c == ' ' || c == '_') {
    return true;
  }
  if (c >= 'a' && c <= 'z') {
    return true;
  }
  return (c >= 'A' && c <= 'Z');
}

// True when the code point separates words: any whitespace accepted by
// codepointIsWhitespace, or one of  ; : . , ! ? | / \ .
fn bool codepointIsWordBreak(Codepoint c) {
  if (codepointIsWhitespace(c)) {
    return true;
  }
  switch (c.code) {
    case ';': case ':':
    case '.': case ',': case '!': case '?':
    case '|': case '/': case '\\':
      return true;
    default:
      return false;
  }
}

// True when the code point is a space, a tab or a newline.
fn bool codepointIsWhitespace(Codepoint c) {
  if (c.code == ' ') {
    return true;
  }
  if (c.code == '\t') {
    return true;
  }
  return c.code == '\n';
}

// Reads the UTF-8 character that starts at bytes[offset].
//
// The returned Codepoint carries the classification of the first byte, the
// number of bytes in the sequence, and `code`: the raw bytes of the
// sequence packed big-endian into one integer (first byte in the highest
// used position). `code` is NOT the decoded Unicode scalar value.
//
// There is no length parameter, so the caller must guarantee that every
// byte of the sequence (up to bytes[offset+3]) is readable. A byte that is
// not a valid starting byte is reported by utf8CharacterClassify and by the
// assert below.
fn Codepoint codepointFromBytes(ptr bytes, u32 offset) {
  Codepoint result = {0};
  result.type = utf8CharacterClassify(bytes[offset]);
  // Each byte is converted to u8 and widened to u32 *before* shifting:
  // shifting a promoted int by 24 overflows for lead bytes >= 0x80, and a
  // signed char element would otherwise sign-extend into the packed value.
  switch (result.type) {
    case Utf8CharacterAscii: {
      result.size = 1;
      result.code = (u8)bytes[offset];
    } break;
    case Utf8CharacterTwoByte: {
      result.size = 2;
      result.code = (
        (u32)(u8)bytes[offset] << 8 | (u32)(u8)bytes[offset+1]
      );
    } break;
    case Utf8CharacterThreeByte: {
      result.size = 3;
      result.code = (
        (u32)(u8)bytes[offset] << 16 | (u32)(u8)bytes[offset+1] << 8 | (u32)(u8)bytes[offset+2]
      );
    } break;
    case Utf8CharacterFourByte: {
      result.size = 4;
      result.code = (
        (u32)(u8)bytes[offset] << 24 | (u32)(u8)bytes[offset+1] << 16 | (u32)(u8)bytes[offset+2] << 8 | (u32)(u8)bytes[offset+3]
      );
    } break;
    case Utf8Character_Count: {
      printf("unabled to classify utf8 codepoint %d\n", bytes[offset]);
      assert(false);
    } break;
  }
  return result;
}

// Reads the UTF-8 character that ends immediately before bytes[offset].
//
// Walks backwards from `offset` until it finds a byte that
// utf8CharacterClassifyUnsafe accepts as a starting byte, then packs the
// sequence the same way codepointFromBytes does: `code` holds the raw bytes
// big-endian, not the decoded Unicode scalar value.
//
// `offset` must be greater than zero. The walk is limited to 4 bytes (the
// longest UTF-8 sequence) and never moves past index 0; if no starting byte
// is found the failure is printed and asserted, and the result keeps
// type Utf8Character_Count.
fn Codepoint codepointFromBytesBefore(ptr bytes, u32 offset) {
  assert(offset > 0);
  Codepoint result = { .type = Utf8Character_Count };
  u32 steps = 0;
  // `offset > 0` keeps the unsigned index from wrapping around when the
  // buffer begins with continuation bytes.
  while (result.type == Utf8Character_Count && steps < 4 && offset > 0) {
    steps++;
    offset -= 1;
    result.type = utf8CharacterClassifyUnsafe(bytes[offset]);
  }
  // Each byte is converted to u8 and widened to u32 *before* shifting:
  // shifting a promoted int by 24 overflows for lead bytes >= 0x80, and a
  // signed char element would otherwise sign-extend into the packed value.
  switch (result.type) {
    case Utf8CharacterAscii: {
      result.size = 1;
      result.code = (u8)bytes[offset];
    } break;
    case Utf8CharacterTwoByte: {
      result.size = 2;
      result.code = (
        (u32)(u8)bytes[offset] << 8 | (u32)(u8)bytes[offset+1]
      );
    } break;
    case Utf8CharacterThreeByte: {
      result.size = 3;
      result.code = (
        (u32)(u8)bytes[offset] << 16 | (u32)(u8)bytes[offset+1] << 8 | (u32)(u8)bytes[offset+2]
      );
    } break;
    case Utf8CharacterFourByte: {
      result.size = 4;
      result.code = (
        (u32)(u8)bytes[offset] << 24 | (u32)(u8)bytes[offset+1] << 16 | (u32)(u8)bytes[offset+2] << 8 | (u32)(u8)bytes[offset+3]
      );
    } break;
    case Utf8Character_Count: {
      printf("unabled to classify utf8 codepoint %d\n", bytes[offset]);
      assert(false);
    } break;
  }
  return result;
}

// Builds a Codepoint from `c`, an integer holding the raw UTF-8 bytes of one
// character packed big-endian (the layout codepointFromBytes produces).
//
// The sequence length is recovered from where the starting byte sits:
//   c <= 127                       -> ASCII, 1 byte
//   bits 8..15  hold 0xC0..0xDF    -> 2 bytes
//   bits 16..23 hold 0xE0..0xEF    -> 3 bytes
//   bits 24..31 hold 0xF0 or above -> 4 bytes
//
// A value that matches none of these is reported as Utf8Character_Count
// with size 0, so callers that switch on the type reach their invalid-input
// branch instead of seeing a zero-initialised type.
fn Codepoint codepointFromRawInt(u32 c) {
  Codepoint result = { .code = c };
  if (c <= 127) { // ascii
    result.type = Utf8CharacterAscii;
    result.size = 1;
  } else if ((c >> 8) >= 192 && (c >> 8) < 224) {
    result.type = Utf8CharacterTwoByte;
    result.size = 2;
  } else if ((c >> 16) >= 224 && (c >> 16) < 240) {
    result.type = Utf8CharacterThreeByte;
    result.size = 3;
  } else if ((c >> 24) >= 240) {
    result.type = Utf8CharacterFourByte;
    result.size = 4;
  } else {
    // Not a packed UTF-8 sequence; mark it invalid explicitly.
    result.type = Utf8Character_Count;
    result.size = 0;
  }
  return result;
}

// Writes the raw UTF-8 bytes packed in cp.code into `buf`, most significant
// used byte first. The number of bytes written is chosen by cp.type
// (1 to 4); `buf` must have room for that many. A Utf8Character_Count type
// asserts and writes nothing.
fn void codepointFillBuf(Codepoint cp, ptr buf) {
  u32 count = 0;
  switch (cp.type) {
    case Utf8CharacterAscii:     count = 1; break;
    case Utf8CharacterTwoByte:   count = 2; break;
    case Utf8CharacterThreeByte: count = 3; break;
    case Utf8CharacterFourByte:  count = 4; break;
    case Utf8Character_Count: {
      assert(false);
    } break;
  }
  // Byte i comes from bits [8*(count-1-i), 8*(count-i)) of the packed code.
  for (u32 i = 0; i < count; i++) {
    u32 shift = 8 * (count - 1 - i);
    buf[i] = (cp.code >> shift) & 0xFF;
  }
}

// Allocates a String from arena `a` that holds exactly the bytes of the
// packed UTF-8 value `c` (see codepointFromRawInt). Length and capacity are
// both set to the sequence's byte count.
fn String stringFromRawCodepoint(Arena* a, u32 c) {
  Codepoint codepoint = codepointFromRawInt(c);

  // Byte count is derived from the classification, not from codepoint.size.
  u32 byte_count = 0;
  if (codepoint.type == Utf8CharacterAscii) {
    byte_count = 1;
  } else if (codepoint.type == Utf8CharacterTwoByte) {
    byte_count = 2;
  } else if (codepoint.type == Utf8CharacterThreeByte) {
    byte_count = 3;
  } else if (codepoint.type == Utf8CharacterFourByte) {
    byte_count = 4;
  } else if (codepoint.type == Utf8Character_Count) {
    assert(false);
  }

  String result = {0};
  result.capacity = byte_count;
  result.length = result.capacity;
  result.bytes = arenaAlloc(a, result.length);
  codepointFillBuf(codepoint, result.bytes);
  return result;
}

// Inserts the bytes of code point `c` into `s` at `byte_offset`, moving the
// bytes from that offset onward towards the end of the string.
//
// Returns false and leaves `s` untouched when:
//   - byte_offset lies past the end of the string,
//   - c.size does not fit in the 4-byte scratch buffer, or
//   - the unused capacity of `s` is smaller than c.size.
// Returns true after the insert; s->length grows by c.size.
fn bool stringInsertCodepointAtByte(String* s, Codepoint c, u32 byte_offset) {
  // Inserting past the end would leave a gap of unset bytes in the string.
  if (byte_offset > s->length) return false;

  char codepoint_bytes[4] = {0};
  if (c.size > sizeof(codepoint_bytes)) return false;

  u32 remaining_space = s->capacity - s->length;
  if (remaining_space < c.size) return false;

  codepointFillBuf(c, codepoint_bytes);

  // Move the tail right by c.size in one pass, back to front so that no
  // byte is overwritten before it has been copied.
  for (u32 i = s->length; i > byte_offset; i--) {
    s->bytes[i - 1 + c.size] = s->bytes[i - 1];
  }
  for (u32 i = 0; i < c.size; i++) {
    s->bytes[byte_offset + i] = codepoint_bytes[i];
  }
  s->length += c.size;
  return true;
}

// Removes the UTF-8 character that starts at `byte_offset` from `s`, moving
// the following bytes forward to close the gap.
//
// Returns false when byte_offset is not inside the string. Otherwise
// returns true and shrinks s->length by the number of bytes removed. If the
// sequence claims more bytes than remain in the string, only the remaining
// bytes are removed.
//
// Note: codepointFromBytes takes no length, so for a sequence that is cut
// off at the end of the string it may still read past s->length.
fn bool stringDeleteCodepointAtByte(String* s, u32 byte_offset) {
  // byte_offset == length has no character to delete.
  if (byte_offset >= s->length) return false;

  Codepoint cp = codepointFromBytes(s->bytes, byte_offset);

  // Never remove more than what is left, so length cannot underflow.
  u32 remaining = s->length - byte_offset;
  u32 removed = cp.size < remaining ? cp.size : remaining;

  // Close the gap in one pass. Reads stay below s->length, and every byte
  // of the character is dropped (not just its first byte).
  for (u32 i = byte_offset + removed; i < s->length; i++) {
    s->bytes[i - removed] = s->bytes[i];
  }
  s->length -= removed;
  return true;
}

// True for the printable ASCII range, from space (' ') through tilde ('~').
fn bool isSimplePrintable(u8 c) {
  return !(c < ' ' || c > '~');
}

// Result of decoding one character from an encoded buffer.
// Filled in by strDecodeUTF8 and strDecodeUTF16.
typedef struct StrDecode {
	// Decoded value; the decoders store '#' here when the input at the
	// current position cannot be decoded.
	u32 codepoint;
	// Number of input elements consumed: u8 bytes for strDecodeUTF8,
	// u16 units for strDecodeUTF16.
	u32 size;
} StrDecode;

// Decodes one UTF-8 character from `string`, reading at most `cap` bytes.
//
// Returns the decoded code point and the number of bytes consumed.
// When cap is 0 the result is all zeros. When the first byte is not a valid
// starting byte, the sequence does not fit in `cap`, or a continuation byte
// is not of the form 10xxxxxx, the result is the placeholder '#' with
// size 1 so that the caller can skip one byte and resynchronise.
// Overlong encodings and surrogate values are not rejected.
fn StrDecode strDecodeUTF8(u8 *string, u32 cap){
	// Sequence length indexed by the top five bits of the first byte;
	// 0 marks a byte that cannot start a sequence.
	static const u8 length[] = {
		1, 1, 1, 1, // 000xx
		1, 1, 1, 1,
		1, 1, 1, 1,
		1, 1, 1, 1,
		0, 0, 0, 0, // 100xx
		0, 0, 0, 0,
		2, 2, 2, 2, // 110xx
		3, 3,       // 1110x
		4,          // 11110
		0,          // 11111
	};
	// Payload bits of the first byte, indexed by sequence length.
	static const u8 first_byte_mask[] = { 0, 0x7F, 0x1F, 0x0F, 0x07 };
	// The value is assembled as if it were a 4-byte sequence and then
	// shifted down by this amount for shorter sequences.
	static const u8 final_shift[] = { 0, 18, 12, 6, 0 };

	StrDecode result = {0};
	if (cap > 0){
		result.codepoint = '#';
		result.size = 1;

		u8 byte = string[0];
		u8 l = length[byte >> 3];
		if (0 < l && l <= cap){
			// Every byte after the first must be a continuation byte;
			// otherwise the sequence would swallow unrelated bytes.
			bool well_formed = true;
			for (u32 i = 1; i < l; i++){
				if ((string[i] & 0xC0) != 0x80){
					well_formed = false;
					break;
				}
			}

			if (well_formed){
				u32 cp = (u32)(byte & first_byte_mask[l]) << 18;
				switch (l){
					case 4: cp |= ((u32)(string[3] & 0x3F) << 0);  // fall through
					case 3: cp |= ((u32)(string[2] & 0x3F) << 6);  // fall through
					case 2: cp |= ((u32)(string[1] & 0x3F) << 12);
					default: break;
				}
				cp >>= final_shift[l];

				result.codepoint = cp;
				result.size = l;
			}
		}
	}

	return result;
}

// Encodes `codepoint` as UTF-8 into `dst` and returns the number of bytes
// written (1 to 4). `dst` must have room for 4 bytes.
//
//   below 0x80      -> 1 byte
//   below 0x800     -> 2 bytes
//   below 0x10000   -> 3 bytes
//   below 0x200000  -> 4 bytes
//   anything larger -> the placeholder '#', 1 byte
//
// Surrogate values and values above U+10FFFF are not rejected.
fn u32 strEncodeUTF8(u8 *dst, u32 codepoint){
	u32 size = 0;
	// Only 7-bit values fit in one byte: 0x80..0xFF written as a single
	// byte would be a continuation or lead byte, i.e. invalid UTF-8.
	if (codepoint < (1 << 7)) {
		dst[0] = codepoint;
		size = 1;
	} else if (codepoint < (1 << 11)) {
		dst[0] = 0xC0 | (codepoint >> 6);
		dst[1] = 0x80 | (codepoint & 0x3F);
		size = 2;
	}
	else if (codepoint < (1 << 16)) {
		dst[0] = 0xE0 | (codepoint >> 12);
		dst[1] = 0x80 | ((codepoint >> 6) & 0x3F);
		dst[2] = 0x80 | (codepoint & 0x3F);
		size = 3;
	} else if (codepoint < (1 << 21)) {
		dst[0] = 0xF0 | (codepoint >> 18);
		dst[1] = 0x80 | ((codepoint >> 12) & 0x3F);
		dst[2] = 0x80 | ((codepoint >> 6) & 0x3F);
		dst[3] = 0x80 | (codepoint & 0x3F);
		size = 4;
	} else {
		dst[0] = '#';
		size = 1;
	}
	return size;
}

// Decodes one character from the UTF-16 buffer `str`, reading at most `cap`
// 16-bit units.
//
// Returns the decoded code point and the number of units consumed (1 or 2).
// When cap is 0 nothing is read and the result is all zeros, matching
// strDecodeUTF8. An unpaired surrogate, or a high surrogate with no room
// for its partner, yields the placeholder '#' with size 1.
fn StrDecode strDecodeUTF16(u16 *str, u32 cap){
	StrDecode result = {0};
	if (cap == 0) {
		// Empty input: str[0] must not be read.
		return result;
	}

	result.codepoint = '#';
	result.size = 1;

	u16 x = str[0];
	if (x < 0xD800 || 0xDFFF < x) {
		// Outside the surrogate range: the unit is the code point.
		result.codepoint = x;
	} else if (cap >= 2) {
		u16 y = str[1];
		// A valid pair is a high surrogate followed by a low surrogate.
		if (0xD800 <= x && x < 0xDC00 &&
			0xDC00 <= y && y < 0xE000
		) {
			u16 xj = x - 0xD800;
			u16 yj = y - 0xDc00;
			u32 xy = ((u32)xj << 10) | yj;
			result.codepoint = xy + 0x10000;
			result.size = 2;
		}
	}
	return result;
}

// Encodes `codepoint` as UTF-16 into `dst` and returns the number of 16-bit
// units written (1 or 2). `dst` must have room for 2 units.
//
//   below 0x10000        -> 1 unit, stored as-is (surrogate values included)
//   0x10000..0x10FFFF    -> surrogate pair, 2 units
//   above 0x10FFFF       -> the placeholder '#', 1 unit
//
// The upper bound matters because strDecodeUTF8 can produce values up to
// 0x1FFFFF, which do not fit in a surrogate pair.
fn u32 strEncodeUTF16(u16 *dst, u32 codepoint){
	u32 size = 0;
	if (codepoint < 0x10000) {
		dst[0] = codepoint;
		size = 1;
	} else if (codepoint <= 0x10FFFF) {
		u32 cpj = codepoint - 0x10000;
		dst[0] = (cpj >> 10) + 0xD800;
		dst[1] = (cpj & 0x3FF) + 0xDC00;
		size = 2;
	} else {
		// Same fallback character as strEncodeUTF8 uses.
		dst[0] = '#';
		size = 1;
	}
	return(size);
}

// Converts the UTF-8 bytes of `string` to a zero-terminated UTF-16 buffer
// allocated from `arena`.
//
// The buffer is requested with string.length * 2 + 1 units up front; after
// the conversion the units that were not used are handed back with
// arenaDealloc. The result holds the buffer and the number of units
// written, not counting the terminator.
fn StringUTF16Const str16FromStr8(Arena* arena, String string) {
	u16* memory = arenaAllocArray(arena, u16, string.length * 2 + 1);

	u8* src = (u8*)string.bytes;
	u8* src_end = (u8*)string.bytes + string.length;
	u16* out = memory;

	// Decode one UTF-8 character at a time and re-encode it as UTF-16.
	while (src < src_end) {
		StrDecode decode = strDecodeUTF8(src, (u64)(src_end - src));
		u32 units_written = strEncodeUTF16(out, decode.codepoint);
		src += decode.size;
		out += units_written;
	}

	*out = 0;

	// Return everything past the terminator to the arena.
	u64 string_count = (u64)(out - memory);
	u64 alloc_count = string.length*2 + 1;
	u64 unused_count = alloc_count - string_count - 1;
	arenaDealloc(arena, unused_count * sizeof(*memory));

	StringUTF16Const result = { memory, string_count };
	return result;
}