Implement JSON_ENSURE_ASCII encoding flag

With this flag, all Unicode characters outside the ASCII range are
escaped.
This commit is contained in:
Petri Lehtinen 2009-12-05 22:55:30 +02:00
parent d67aeb9739
commit 50031440a3
7 changed files with 150 additions and 24 deletions

View file

@ -519,6 +519,13 @@ can be ORed together to obtain *flags*.
.. versionadded:: 1.2
``JSON_ENSURE_ASCII``
If this flag is used, the output is guaranteed to consist only of
ASCII characters. This is achieved by escaping all Unicode
characters outside the ASCII range.
.. versionadded:: 1.2
The following functions perform the actual JSON encoding. The result
is in UTF-8.

View file

@ -14,6 +14,7 @@
#include <jansson.h>
#include "jansson_private.h"
#include "strbuffer.h"
#include "utf.h"
#define MAX_INTEGER_STR_LENGTH 100
#define MAX_REAL_STR_LENGTH 100
@ -65,34 +66,49 @@ static int dump_indent(unsigned long flags, int depth, int space, dump_func dump
return 0;
}
static int dump_string(const char *str, dump_func dump, void *data)
static int dump_string(const char *str, int ascii, dump_func dump, void *data)
{
const char *end;
const char *pos, *end;
int32_t codepoint;
if(dump("\"", 1, data))
return -1;
end = str;
end = pos = str;
while(1)
{
const char *text;
char seq[7];
char seq[13];
int length;
while(*end && *end != '\\' && *end != '"' && (unsigned char)*end > 0x1F)
end++;
while(*end)
{
end = utf8_iterate(pos, &codepoint);
if(!end)
return -1;
if(end != str) {
if(dump(str, end - str, data))
/* mandatory escape or control char */
if(codepoint == '\\' || codepoint == '"' || codepoint < 0x20)
break;
/* non-ASCII */
if(ascii && codepoint > 0x7F)
break;
pos = end;
}
if(pos != str) {
if(dump(str, pos - str, data))
return -1;
}
if(!*end)
if(end == pos)
break;
/* handle \, ", and control codes */
length = 2;
switch(*end)
switch(codepoint)
{
case '\\': text = "\\\\"; break;
case '\"': text = "\\\""; break;
@ -103,9 +119,27 @@ static int dump_string(const char *str, dump_func dump, void *data)
case '\t': text = "\\t"; break;
default:
{
sprintf(seq, "\\u00%02x", *end);
/* codepoint is in BMP */
if(codepoint < 0x10000)
{
sprintf(seq, "\\u%04x", codepoint);
length = 6;
}
/* not in BMP -> construct a UTF-16 surrogate pair */
else
{
int32_t first, last;
codepoint -= 0x10000;
first = 0xD800 | ((codepoint & 0xffc00) >> 10);
last = 0xDC00 | (codepoint & 0x003ff);
sprintf(seq, "\\u%04x\\u%04x", first, last);
length = 12;
}
text = seq;
length = 6;
break;
}
}
@ -113,8 +147,7 @@ static int dump_string(const char *str, dump_func dump, void *data)
if(dump(text, length, data))
return -1;
end++;
str = end;
str = pos = end;
}
return dump("\"", 1, data);
@ -123,6 +156,8 @@ static int dump_string(const char *str, dump_func dump, void *data)
static int do_dump(const json_t *json, unsigned long flags, int depth,
dump_func dump, void *data)
{
int ascii = flags & JSON_ENSURE_ASCII ? 1 : 0;
switch(json_typeof(json)) {
case JSON_NULL:
return dump("null", 4, data);
@ -158,7 +193,7 @@ static int do_dump(const json_t *json, unsigned long flags, int depth,
}
case JSON_STRING:
return dump_string(json_string_value(json), dump, data);
return dump_string(json_string_value(json), ascii, dump, data);
case JSON_ARRAY:
{
@ -238,7 +273,7 @@ static int do_dump(const json_t *json, unsigned long flags, int depth,
{
void *next = json_object_iter_next((json_t *)json, iter);
dump_string(json_object_iter_key(iter), dump, data);
dump_string(json_object_iter_key(iter), ascii, dump, data);
if(dump(separator, separator_length, data) ||
do_dump(json_object_iter_value(iter), flags, depth + 1,
dump, data))

View file

@ -141,8 +141,9 @@ json_t *json_loads(const char *input, json_error_t *error);
json_t *json_loadf(FILE *input, json_error_t *error);
json_t *json_load_file(const char *path, json_error_t *error);
#define JSON_INDENT(n) (n & 0xFF)
#define JSON_COMPACT 0x100
#define JSON_INDENT(n) (n & 0xFF)
#define JSON_COMPACT 0x100
#define JSON_ENSURE_ASCII 0x200
char *json_dumps(const json_t *json, unsigned long flags);
int json_dumpf(const json_t *json, FILE *output, unsigned long flags);

View file

@ -149,7 +149,7 @@ static char stream_get(stream_t *stream, json_error_t *error)
for(i = 1; i < count; i++)
stream->buffer[i] = stream->get(stream->data);
if(!utf8_check_full(stream->buffer, count))
if(!utf8_check_full(stream->buffer, count, NULL))
goto out;
stream->stream_pos += count;

View file

@ -80,7 +80,7 @@ int utf8_check_first(char byte)
}
}
int utf8_check_full(const char *buffer, int size)
int utf8_check_full(const char *buffer, int size, int32_t *codepoint)
{
int i;
int32_t value = 0;
@ -130,9 +130,38 @@ int utf8_check_full(const char *buffer, int size)
return 0;
}
if(codepoint)
*codepoint = value;
return 1;
}
/*
 * Decode the UTF-8 sequence found at the start of `buffer`.
 *
 * On success returns a pointer to the byte following the decoded sequence
 * and, when `codepoint` is non-NULL, stores the decoded value through it.
 * When `buffer` points at the terminating NUL, `buffer` itself is returned
 * and `codepoint` is left untouched.
 * Returns NULL when utf8_check_first() rejects the leading byte or
 * utf8_check_full() rejects the multi-byte sequence.
 */
const char *utf8_iterate(const char *buffer, int32_t *codepoint)
{
    int32_t decoded;
    int length;

    /* end of string: nothing to decode */
    if(*buffer == '\0')
        return buffer;

    length = utf8_check_first(buffer[0]);
    if(length <= 0)
        return NULL;

    if(length > 1)
    {
        /* multi-byte sequence: validate it and fetch its value */
        if(!utf8_check_full(buffer, length, &decoded))
            return NULL;
    }
    else
    {
        /* single byte: the byte value is the code point */
        decoded = (unsigned char)buffer[0];
    }

    if(codepoint != NULL)
        *codepoint = decoded;

    return buffer + length;
}
int utf8_check_string(const char *string, int length)
{
int i;
@ -150,7 +179,7 @@ int utf8_check_string(const char *string, int length)
if(i + count > length)
return 0;
if(!utf8_check_full(&string[i], count))
if(!utf8_check_full(&string[i], count, NULL))
return 0;
i += count - 1;

View file

@ -11,7 +11,8 @@
int utf8_encode(int codepoint, char *buffer, int *size);
int utf8_check_first(char byte);
int utf8_check_full(const char *buffer, int size);
int utf8_check_full(const char *buffer, int size, int32_t *codepoint);
const char *utf8_iterate(const char *buffer, int32_t *codepoint);
int utf8_check_string(const char *string, int length);

View file

@ -131,8 +131,8 @@ static void test_compact()
#define INDENTED_COMPACT_OBJECT \
"{\n" \
" \"a\":1,\n" \
" \"b\":2\n" \
" \"a\":1,\n" \
" \"b\":2\n" \
"}"
#define INDENTED_COMPACT_ARRAY \
"[\n" \
@ -163,12 +163,65 @@ static void test_compact_indent()
json_decref(array);
}
/*
 * Test vectors for JSON_ENSURE_ASCII. Each row pairs a UTF-8 encoded input
 * string with the text expected between the quotes of the dumped JSON
 * string when the flag is set.
 */
static const char *test_ensure_ascii_data[][2] = {
/*
{ "input", "output" }
*/
/* ascii: passes through unchanged */
{ "foo", "foo" },
/* BMP: U+00E4, U+00F6 and U+00E5 (two-byte UTF-8) become single \uXXXX
   escapes, alone and mixed with ASCII text before, after and between */
{ "\xc3\xa4 \xc3\xb6 \xc3\xa5", "\\u00e4 \\u00f6 \\u00e5" },
{ "foo \xc3\xa4\xc3\xa5", "foo \\u00e4\\u00e5" },
{ "\xc3\xa4\xc3\xa5 foo", "\\u00e4\\u00e5 foo" },
{ "\xc3\xa4 foo \xc3\xa5", "\\u00e4 foo \\u00e5" },
/* non-BMP: U+1D11E (four-byte UTF-8) becomes the UTF-16 surrogate pair
   D834 DD1E, written as two \uXXXX escapes */
{ "clef g: \xf0\x9d\x84\x9e", "clef g: \\ud834\\udd1e" },
};
/*
 * Dump every input of test_ensure_ascii_data with JSON_ENSURE_ASCII and
 * compare the encoded string body against the expected output.
 *
 * Each input string is wrapped in a one-element array before dumping; the
 * leading [" and trailing "] are stripped from the dump before comparing.
 * All JSON values and the dump buffer are released on every iteration.
 * fail() is expected not to return.
 */
static void test_ensure_ascii()
{
    int i;
    int num_tests = sizeof(test_ensure_ascii_data) /
                    sizeof(test_ensure_ascii_data[0]);

    for(i = 0; i < num_tests; i++) {
        json_t *array, *string;
        const char *input, *output;
        char *result, *stripped;
        size_t length;
        int matches;

        input = test_ensure_ascii_data[i][0];
        output = test_ensure_ascii_data[i][1];

        array = json_array();
        string = json_string(input);
        if(!array || !string)
            fail("unable to create json values");

        if(json_array_append(array, string))
            fail("unable to append the string to the array");

        /* json_array_append() does not steal the reference, so the local
           one has to be dropped here; the array keeps the string alive */
        json_decref(string);

        result = json_dumps(array, JSON_ENSURE_ASCII);
        json_decref(array);
        if(!result)
            fail("json_dumps failed");

        /* the dump must at least contain [" and "] */
        length = strlen(result);
        if(length < 4) {
            free(result);
            fail("the result of json_dumps is too short");
        }

        /* strip leading [" and trailing "] */
        stripped = &result[2];
        result[length - 2] = '\0';

        /* compare first, then free, so the buffer is released exactly once
           on both the passing and the failing path */
        matches = strcmp(stripped, output) == 0;
        free(result);
        if(!matches)
            fail("the result of json_dumps is invalid");
    }
}
/*
 * Test driver: runs each dump test case in a fixed order and returns 0
 * once all of them have returned.
 */
int main(void)
{
test_normal();
test_indent();
test_compact();
test_compact_indent();
/* exercises the JSON_ENSURE_ASCII flag */
test_ensure_ascii();
return 0;
}