git.zerfleddert.de Git - proxmark3-svn/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2009-2016 Petri Lehtinen <petri@digip.org>
	3	*
	4	* Jansson is free software; you can redistribute it and/or modify
	5	* it under the terms of the MIT license. See LICENSE for details.
	6	*/
	7
	8	#include <string.h>
	9	#include "utf.h"
	10
	11	int utf8_encode(int32_t codepoint, char buffer, size_t size)
	12	{
	13	if(codepoint < 0)
	14	return -1;
	15	else if(codepoint < 0x80)
	16	{
	17	buffer[0] = (char)codepoint;
	18	*size = 1;
	19	}
	20	else if(codepoint < 0x800)
	21	{
	22	buffer[0] = 0xC0 + ((codepoint & 0x7C0) >> 6);
	23	buffer[1] = 0x80 + ((codepoint & 0x03F));
	24	*size = 2;
	25	}
	26	else if(codepoint < 0x10000)
	27	{
	28	buffer[0] = 0xE0 + ((codepoint & 0xF000) >> 12);
	29	buffer[1] = 0x80 + ((codepoint & 0x0FC0) >> 6);
	30	buffer[2] = 0x80 + ((codepoint & 0x003F));
	31	*size = 3;
	32	}
	33	else if(codepoint <= 0x10FFFF)
	34	{
	35	buffer[0] = 0xF0 + ((codepoint & 0x1C0000) >> 18);
	36	buffer[1] = 0x80 + ((codepoint & 0x03F000) >> 12);
	37	buffer[2] = 0x80 + ((codepoint & 0x000FC0) >> 6);
	38	buffer[3] = 0x80 + ((codepoint & 0x00003F));
	39	*size = 4;
	40	}
	41	else
	42	return -1;
	43
	44	return 0;
	45	}
	46
	47	size_t utf8_check_first(char byte)
	48	{
	49	unsigned char u = (unsigned char)byte;
	50
	51	if(u < 0x80)
	52	return 1;
	53
	54	if(0x80 <= u && u <= 0xBF) {
	55	/* second, third or fourth byte of a multi-byte
	56	sequence, i.e. a "continuation byte" */
	57	return 0;
	58	}
	59	else if(u == 0xC0 \|\| u == 0xC1) {
	60	/* overlong encoding of an ASCII byte */
	61	return 0;
	62	}
	63	else if(0xC2 <= u && u <= 0xDF) {
	64	/* 2-byte sequence */
	65	return 2;
	66	}
	67
	68	else if(0xE0 <= u && u <= 0xEF) {
	69	/* 3-byte sequence */
	70	return 3;
	71	}
	72	else if(0xF0 <= u && u <= 0xF4) {
	73	/* 4-byte sequence */
	74	return 4;
	75	}
	76	else { /* u >= 0xF5 */
	77	/* Restricted (start of 4-, 5- or 6-byte sequence) or invalid
	78	UTF-8 */
	79	return 0;
	80	}
	81	}
	82
	83	size_t utf8_check_full(const char buffer, size_t size, int32_t codepoint)
	84	{
	85	size_t i;
	86	int32_t value = 0;
	87	unsigned char u = (unsigned char)buffer[0];
	88
	89	if(size == 2)
	90	{
	91	value = u & 0x1F;
	92	}
	93	else if(size == 3)
	94	{
	95	value = u & 0xF;
	96	}
	97	else if(size == 4)
	98	{
	99	value = u & 0x7;
	100	}
	101	else
	102	return 0;
	103
	104	for(i = 1; i < size; i++)
	105	{
	106	u = (unsigned char)buffer[i];
	107
	108	if(u < 0x80 \|\| u > 0xBF) {
	109	/* not a continuation byte */
	110	return 0;
	111	}
	112
	113	value = (value << 6) + (u & 0x3F);
	114	}
	115
	116	if(value > 0x10FFFF) {
	117	/* not in Unicode range */
	118	return 0;
	119	}
	120
	121	else if(0xD800 <= value && value <= 0xDFFF) {
	122	/* invalid code point (UTF-16 surrogate halves) */
	123	return 0;
	124	}
	125
	126	else if((size == 2 && value < 0x80) \|\|
	127	(size == 3 && value < 0x800) \|\|
	128	(size == 4 && value < 0x10000)) {
	129	/* overlong encoding */
	130	return 0;
	131	}
	132
	133	if(codepoint)
	134	*codepoint = value;
	135
	136	return 1;
	137	}
	138
	139	const char utf8_iterate(const char buffer, size_t bufsize, int32_t *codepoint)
	140	{
	141	size_t count;
	142	int32_t value;
	143
	144	if(!bufsize)
	145	return buffer;
	146
	147	count = utf8_check_first(buffer[0]);
	148	if(count <= 0)
	149	return NULL;
	150
	151	if(count == 1)
	152	value = (unsigned char)buffer[0];
	153	else
	154	{
	155	if(count > bufsize \|\| !utf8_check_full(buffer, count, &value))
	156	return NULL;
	157	}
	158
	159	if(codepoint)
	160	*codepoint = value;
	161
	162	return buffer + count;
	163	}
	164
	165	int utf8_check_string(const char *string, size_t length)
	166	{
	167	size_t i;
	168
	169	for(i = 0; i < length; i++)
	170	{
	171	size_t count = utf8_check_first(string[i]);
	172	if(count == 0)
	173	return 0;
	174	else if(count > 1)
	175	{
	176	if(count > length - i)
	177	return 0;
	178
	179	if(!utf8_check_full(&string[i], count, NULL))
	180	return 0;
	181
	182	i += count - 1;
	183	}
	184	}
	185
	186	return 1;
	187	}