While it's technically "safe" to include user supplied data in
json_encode()
serialized values. The fact that characters such as <>&'
remain as is means there room for some as-yet unidentified problem
either in the browser's rendering or (more likely) elsewhere in one's
codebase for this data to get into the wrong context and be executed.
To that end, the attached patch allows the caller to be paranoid about
their data and stipulate that <>&' should be encoded to hex references
instead. This doesn't stop a web developer from dropping that content
into an innerHTML of course, but it's one more rope holding the ship
together.
Obviously, since this adds five characters per pedantically escaped
character, it's not something you'd want on by default, so the normal
behavior would be to leave them alone.
echo json_encode("<foo>");
"<foo>"
echo json_encode("<foo>", JSON_HEX_TAG);
"\u003Cfoo\u003E"
echo json_encode("<foo bar='baz'>", JSON_HEX_TAG
| JSON_HEX_APOS);
"\u003Cfoo bar=\u0027baz\u0027\u003E"
If noone objects, I'll commit this in a week along with an MFH for 5.3
-Sara
Index: json.c
RCS file: /repository/pecl/json/json.c,v
retrieving revision 1.31
diff -u -p -r1.31 json.c
--- json.c 1 Oct 2007 15:25:01 -0000 1.31
+++ json.c 29 Nov 2007 19:01:34 -0000
@@ -32,6 +32,10 @@
static const char digits[] = "0123456789abcdef";
+#define PHP_JSON_HEX_TAG (1<<0)
+#define PHP_JSON_HEX_AMP (1<<1)
+#define PHP_JSON_HEX_APOS (1<<2)
- /* {{{ json_functions[]
- Every user visible function must have an entry in json_functions[].
@@ -43,6 +47,18 @@ const function_entry json_functions[] =
};
/* }}} */
- Every user visible function must have an entry in json_functions[].
+/* {{{ MINIT */
+static PHP_MINIT_FUNCTION(json)
+{
- REGISTER_LONG_CONSTANT("JSON_HEX_TAG", PHP_JSON_HEX_TAG, CONST_CS |
CONST_PERSISTENT); - REGISTER_LONG_CONSTANT("JSON_HEX_AMP", PHP_JSON_HEX_AMP, CONST_CS |
CONST_PERSISTENT); - REGISTER_LONG_CONSTANT("JSON_HEX_APOS", PHP_JSON_HEX_APOS, CONST_CS |
CONST_PERSISTENT); - return SUCCESS;
+}
+/* }}} */ - /* {{{ json_module_entry
*/
zend_module_entry json_module_entry = {
@@ -51,7 +67,7 @@ zend_module_entry json_module_entry = {
#endif
"json",
json_functions,
- NULL,
- PHP_MINIT(json),
NULL,
NULL,
NULL,
@@ -78,8 +94,8 @@ PHP_MINFO_FUNCTION(json)
}
/* }}} */
-static void json_encode_r(smart_str *buf, zval *val TSRMLS_DC);
-static void json_escape_string(smart_str *buf, zstr s, int len,
zend_uchar type);
+static void json_encode_r(smart_str *buf, zval *val, int options
TSRMLS_DC);
+static void json_escape_string(smart_str *buf, zstr s, int len,
zend_uchar type, int options);
static int json_determine_array_type(zval *val TSRMLS_DC) / {{{ /
{
@@ -115,7 +131,7 @@ static int json_determine_array_type(zva
}
/ }}} */
-static void json_encode_array(smart_str *buf, zval *val TSRMLS_DC) /
{{{ */
+static void json_encode_array(smart_str *buf, zval *val, int options
TSRMLS_DC) / {{{ */
{
int i, r;
HashTable *myht;
@@ -172,7 +188,7 @@ static void json_encode_array(smart_str
need_comma = 1;
}
-
json_encode_r(buf, *data TSRMLS_CC);
-
json_encode_r(buf, *data, options TSRMLS_CC); } else if (r == 1) { if (i == HASH_KEY_IS_STRING || i == HASH_KEY_IS_UNICODE) {
@@ -187,10 +203,10 @@ static void json_encode_array(smart_str
need_comma = 1;
}
-
json_escape_string(buf, key, key_len - 1,
(i==HASH_KEY_IS_UNICODE)?IS_UNICODE:IS_STRING);
-
json_escape_string(buf, key, key_len - 1,
(i==HASH_KEY_IS_UNICODE)?IS_UNICODE:IS_STRING, options);
smart_str_appendc(buf, ':');
-
json_encode_r(buf, *data TSRMLS_CC);
-
json_encode_r(buf, *data, options TSRMLS_CC); } else { if (need_comma) { smart_str_appendc(buf, ',');
@@ -203,7 +219,7 @@ static void json_encode_array(smart_str
smart_str_appendc(buf, '"');
smart_str_appendc(buf, ':');
-
json_encode_r(buf, *data TSRMLS_CC);
-
json_encode_r(buf, *data, options TSRMLS_CC); } }
@@ -227,7 +243,7 @@ static void json_encode_array(smart_str
#define REVERSE16(us) (((us & 0xf) << 12) | (((us >> 4) & 0xf) << 8) |
(((us >> 8) & 0xf) << 4) | ((us >> 12) & 0xf))
-static void json_escape_string(smart_str buf, zstr s, int len,
zend_uchar type) / {{{ */
+static void json_escape_string(smart_str buf, zstr s, int len,
zend_uchar type, int options) / {{{ */
{
int pos = 0;
unsigned short us;
@@ -305,6 +321,42 @@ static void json_escape_string(smart_str
smart_str_appendl(buf, "\t", 2);
}
break;
-
case '<':
-
{
-
if (options & PHP_JSON_HEX_TAG) {
-
smart_str_appendl(buf, "\\u003C", 6);
-
} else {
-
smary_str_appendc(buf, '<');
-
}
-
}
-
break;
-
case '>':
-
{
-
if (options & PHP_JSON_HEX_TAG) {
-
smart_str_appendl(buf, "\\u003E", 6);
-
} else {
-
smary_str_appendc(buf, '>');
-
}
-
}
-
break;
-
case '&':
-
{
-
if (options & PHP_JSON_HEX_AMP) {
-
smart_str_appendl(buf, "\\u0026", 6);
-
} else {
-
smary_str_appendc(buf, '&');
-
}
-
}
-
break;
-
case '\'':
-
{
-
if (options & PHP_JSON_HEX_APOS) {
-
smart_str_appendl(buf, "\\u0027", 6);
-
} else {
-
smary_str_appendc(buf, '\'');
-
}
-
}
-
break; default: { if (us >= ' ' && (us & 127) == us)
@@ -337,7 +389,7 @@ static void json_escape_string(smart_str
}
/* }}} */
-static void json_encode_r(smart_str *buf, zval val TSRMLS_DC) / {{{ */
+static void json_encode_r(smart_str *buf, zval val, int options
TSRMLS_DC) / {{{ */
{
switch (Z_TYPE_P(val)) {
case IS_NULL:
@@ -374,11 +426,11 @@ static void json_encode_r(smart_str *buf
break;
case IS_STRING:
case IS_UNICODE:
-
json_escape_string(buf, Z_UNIVAL_P(val), Z_UNILEN_P(val),
Z_TYPE_P(val));
-
json_escape_string(buf, Z_UNIVAL_P(val), Z_UNILEN_P(val),
Z_TYPE_P(val), options);
break;
case IS_ARRAY:
case IS_OBJECT:
-
json_encode_array(buf, &val TSRMLS_CC);
-
json_encode_array(buf, &val, options TSRMLS_CC); break; default: zend_error(E_WARNING, "[json] (json_encode_r) type is unsupported,
encoded as null.");
@@ -396,12 +448,13 @@ PHP_FUNCTION(json_encode)
{
zval *parameter;
smart_str buf = {0};
- long options = 0;
- if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "z", ¶meter)
== FAILURE) {
- if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "z|l",
¶meter, &options) == FAILURE) {
return;
}
- json_encode_r(&buf, parameter TSRMLS_CC);
-
json_encode_r(&buf, parameter, options TSRMLS_CC);
/*
- Return as binary string, since the result is 99% likely to be just
is it really needed?
str_replace can easily do that…
it just looks like this "unidentified problem" is not specific to json
While it's technically "safe" to include user supplied data in
json_encode()
serialized values. The fact that characters such as <>&'
remain as is means there room for some as-yet unidentified problem
either in the browser's rendering or (more likely) elsewhere in one's
codebase for this data to get into the wrong context and be executed.To that end, the attached patch allows the caller to be paranoid about
their data and stipulate that <>&' should be encoded to hex references
instead. This doesn't stop a web developer from dropping that content
into an innerHTML of course, but it's one more rope holding the ship
together.Obviously, since this adds five characters per pedantically escaped
character, it's not something you'd want on by default, so the normal
behavior would be to leave them alone.echo json_encode("<foo>");
"<foo>"echo json_encode("<foo>", JSON_HEX_TAG);
"\u003Cfoo\u003E"echo json_encode("<foo bar='baz'>",
JSON_HEX_TAG
| JSON_HEX_APOS);
"\u003Cfoo bar=\u0027baz\u0027\u003E"If noone objects, I'll commit this in a week along with an MFH for 5.3
-Sara
Index: json.c
RCS file: /repository/pecl/json/json.c,v
retrieving revision 1.31
diff -u -p -r1.31 json.c
--- json.c 1 Oct 2007 15:25:01 -0000 1.31
+++ json.c 29 Nov 2007 19:01:34 -0000
@@ -32,6 +32,10 @@static const char digits[] = "0123456789abcdef";
+#define PHP_JSON_HEX_TAG (1<<0)
+#define PHP_JSON_HEX_AMP (1<<1)
+#define PHP_JSON_HEX_APOS (1<<2)
- /* {{{ json_functions[]
- Every user visible function must have an entry in json_functions[].
@@ -43,6 +47,18 @@ const function_entry json_functions[] =
};
/* }}} */+/* {{{ MINIT */
+static PHP_MINIT_FUNCTION(json)
+{
REGISTER_LONG_CONSTANT("JSON_HEX_TAG", PHP_JSON_HEX_TAG, CONST_CS |
CONST_PERSISTENT);
REGISTER_LONG_CONSTANT("JSON_HEX_AMP", PHP_JSON_HEX_AMP, CONST_CS |
CONST_PERSISTENT);
REGISTER_LONG_CONSTANT("JSON_HEX_APOS", PHP_JSON_HEX_APOS, CONST_CS |
CONST_PERSISTENT);
return SUCCESS;
+}
+/* }}} */
- /* {{{ json_module_entry
*/
zend_module_entry json_module_entry = {
@@ -51,7 +67,7 @@ zend_module_entry json_module_entry = {
#endif
"json",
json_functions,
NULL,
PHP_MINIT(json), NULL, NULL, NULL,
@@ -78,8 +94,8 @@ PHP_MINFO_FUNCTION(json)
}
/* }}} */-static void json_encode_r(smart_str *buf, zval *val TSRMLS_DC);
-static void json_escape_string(smart_str *buf, zstr s, int len,
zend_uchar type);
+static void json_encode_r(smart_str *buf, zval *val, int options
TSRMLS_DC);
+static void json_escape_string(smart_str *buf, zstr s, int len,
zend_uchar type, int options);static int json_determine_array_type(zval *val TSRMLS_DC) / {{{ /
{
@@ -115,7 +131,7 @@ static int json_determine_array_type(zva
}
/ }}} */-static void json_encode_array(smart_str *buf, zval *val TSRMLS_DC) /
{{{ */
+static void json_encode_array(smart_str *buf, zval *val, int options
TSRMLS_DC) / {{{ */
{
int i, r;
HashTable *myht;
@@ -172,7 +188,7 @@ static void json_encode_array(smart_str
need_comma = 1;
}
json_encode_r(buf, *data TSRMLS_CC);
json_encode_r(buf, *data, options TSRMLS_CC); } else if (r == 1) { if (i == HASH_KEY_IS_STRING || i == HASH_KEY_IS_UNICODE) {
@@ -187,10 +203,10 @@ static void json_encode_array(smart_str
need_comma = 1;
}
json_escape_string(buf, key, key_len - 1,
(i==HASH_KEY_IS_UNICODE)?IS_UNICODE:IS_STRING);
json_escape_string(buf, key, key_len - 1,
(i==HASH_KEY_IS_UNICODE)?IS_UNICODE:IS_STRING, options);
smart_str_appendc(buf, ':');
json_encode_r(buf, *data TSRMLS_CC);
json_encode_r(buf, *data, options TSRMLS_CC); } else { if (need_comma) { smart_str_appendc(buf, ',');
@@ -203,7 +219,7 @@ static void json_encode_array(smart_str
smart_str_appendc(buf, '"');
smart_str_appendc(buf, ':');
json_encode_r(buf, *data TSRMLS_CC);
json_encode_r(buf, *data, options TSRMLS_CC); } }
@@ -227,7 +243,7 @@ static void json_encode_array(smart_str
#define REVERSE16(us) (((us & 0xf) << 12) | (((us >> 4) & 0xf) << 8) |
(((us >> 8) & 0xf) << 4) | ((us >> 12) & 0xf))-static void json_escape_string(smart_str buf, zstr s, int len,
zend_uchar type) / {{{ */
+static void json_escape_string(smart_str buf, zstr s, int len,
zend_uchar type, int options) / {{{ */
{
int pos = 0;
unsigned short us;
@@ -305,6 +321,42 @@ static void json_escape_string(smart_str
smart_str_appendl(buf, "\t", 2);
}
break;
case '<':
{
if (options & PHP_JSON_HEX_TAG) {
smart_str_appendl(buf, "\\u003C", 6);
} else {
smary_str_appendc(buf, '<');
}
}
break;
case '>':
{
if (options & PHP_JSON_HEX_TAG) {
smart_str_appendl(buf, "\\u003E", 6);
} else {
smary_str_appendc(buf, '>');
}
}
break;
case '&':
{
if (options & PHP_JSON_HEX_AMP) {
smart_str_appendl(buf, "\\u0026", 6);
} else {
smary_str_appendc(buf, '&');
}
}
break;
case '\'':
{
if (options & PHP_JSON_HEX_APOS) {
smart_str_appendl(buf, "\\u0027", 6);
} else {
smary_str_appendc(buf, '\'');
}
}
break; default: { if (us >= ' ' && (us & 127) == us)
@@ -337,7 +389,7 @@ static void json_escape_string(smart_str
}
/* }}} */-static void json_encode_r(smart_str *buf, zval val TSRMLS_DC) / {{{ */
+static void json_encode_r(smart_str *buf, zval val, int options
TSRMLS_DC) / {{{ */
{
switch (Z_TYPE_P(val)) {
case IS_NULL:
@@ -374,11 +426,11 @@ static void json_encode_r(smart_str *buf
break;
case IS_STRING:
case IS_UNICODE:
json_escape_string(buf, Z_UNIVAL_P(val), Z_UNILEN_P(val),
Z_TYPE_P(val));
json_escape_string(buf, Z_UNIVAL_P(val), Z_UNILEN_P(val),
Z_TYPE_P(val), options);
break;
case IS_ARRAY:
case IS_OBJECT:
json_encode_array(buf, &val TSRMLS_CC);
json_encode_array(buf, &val, options TSRMLS_CC); break; default: zend_error(E_WARNING, "[json] (json_encode_r) type is unsupported,
encoded as null.");
@@ -396,12 +448,13 @@ PHP_FUNCTION(json_encode)
{
zval *parameter;
smart_str buf = {0};
long options = 0;
if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "z", ¶meter)
== FAILURE) {
if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "z|l",
¶meter, &options) == FAILURE) {
return;
}
json_encode_r(&buf, parameter TSRMLS_CC);
json_encode_r(&buf, parameter, options TSRMLS_CC); /* * Return as binary string, since the result is 99% likely to be just
--
--
Alexey Zakhlestin
http://blog.milkfarmsoft.com/
str_replace() can fix the problem, but when you have a series of
nested arrays and objects, solving that problem with a separate loop
becomes onerous.
-Sara
P.S. - Sorry about the tripple post before...
Alexey Zakhlestin wrote:
is it really needed?
str_replace can easily do that…it just looks like this "unidentified problem" is not specific to json
While it's technically "safe" to include user supplied data in
json_encode()
serialized values. The fact that characters such as <>&'
remain as is means there room for some as-yet unidentified problem
either in the browser's rendering or (more likely) elsewhere in one's
codebase for this data to get into the wrong context and be executed.To that end, the attached patch allows the caller to be paranoid about
their data and stipulate that <>&' should be encoded to hex references
instead. This doesn't stop a web developer from dropping that content
into an innerHTML of course, but it's one more rope holding the ship
together.Obviously, since this adds five characters per pedantically escaped
character, it's not something you'd want on by default, so the normal
behavior would be to leave them alone.echo json_encode("<foo>");
"<foo>"echo json_encode("<foo>", JSON_HEX_TAG);
"\u003Cfoo\u003E"echo json_encode("<foo bar='baz'>",
JSON_HEX_TAG
| JSON_HEX_APOS);
"\u003Cfoo bar=\u0027baz\u0027\u003E"If noone objects, I'll commit this in a week along with an MFH for 5.3
-Sara
Index: json.c
RCS file: /repository/pecl/json/json.c,v
retrieving revision 1.31
diff -u -p -r1.31 json.c
--- json.c 1 Oct 2007 15:25:01 -0000 1.31
+++ json.c 29 Nov 2007 19:01:34 -0000
@@ -32,6 +32,10 @@static const char digits[] = "0123456789abcdef";
+#define PHP_JSON_HEX_TAG (1<<0)
+#define PHP_JSON_HEX_AMP (1<<1)
+#define PHP_JSON_HEX_APOS (1<<2)
- /* {{{ json_functions[]
- Every user visible function must have an entry in json_functions[].
@@ -43,6 +47,18 @@ const function_entry json_functions[] =
};
/* }}} */+/* {{{ MINIT */
+static PHP_MINIT_FUNCTION(json)
+{
REGISTER_LONG_CONSTANT("JSON_HEX_TAG", PHP_JSON_HEX_TAG, CONST_CS |
CONST_PERSISTENT);
REGISTER_LONG_CONSTANT("JSON_HEX_AMP", PHP_JSON_HEX_AMP, CONST_CS |
CONST_PERSISTENT);
REGISTER_LONG_CONSTANT("JSON_HEX_APOS", PHP_JSON_HEX_APOS, CONST_CS |
CONST_PERSISTENT);
return SUCCESS;
+}
+/* }}} */
- /* {{{ json_module_entry
*/
zend_module_entry json_module_entry = {
@@ -51,7 +67,7 @@ zend_module_entry json_module_entry = {
#endif
"json",
json_functions,
NULL,
PHP_MINIT(json), NULL, NULL, NULL,
@@ -78,8 +94,8 @@ PHP_MINFO_FUNCTION(json)
}
/* }}} */-static void json_encode_r(smart_str *buf, zval *val TSRMLS_DC);
-static void json_escape_string(smart_str *buf, zstr s, int len,
zend_uchar type);
+static void json_encode_r(smart_str *buf, zval *val, int options
TSRMLS_DC);
+static void json_escape_string(smart_str *buf, zstr s, int len,
zend_uchar type, int options);static int json_determine_array_type(zval *val TSRMLS_DC) / {{{ /
{
@@ -115,7 +131,7 @@ static int json_determine_array_type(zva
}
/ }}} */-static void json_encode_array(smart_str *buf, zval *val TSRMLS_DC) /
{{{ */
+static void json_encode_array(smart_str *buf, zval *val, int options
TSRMLS_DC) / {{{ */
{
int i, r;
HashTable *myht;
@@ -172,7 +188,7 @@ static void json_encode_array(smart_str
need_comma = 1;
}
json_encode_r(buf, *data TSRMLS_CC);
json_encode_r(buf, *data, options TSRMLS_CC); } else if (r == 1) { if (i == HASH_KEY_IS_STRING || i == HASH_KEY_IS_UNICODE) {
@@ -187,10 +203,10 @@ static void json_encode_array(smart_str
need_comma = 1;
}
json_escape_string(buf, key, key_len - 1,
(i==HASH_KEY_IS_UNICODE)?IS_UNICODE:IS_STRING);
json_escape_string(buf, key, key_len - 1,
(i==HASH_KEY_IS_UNICODE)?IS_UNICODE:IS_STRING, options);
smart_str_appendc(buf, ':');
json_encode_r(buf, *data TSRMLS_CC);
json_encode_r(buf, *data, options TSRMLS_CC); } else { if (need_comma) { smart_str_appendc(buf, ',');
@@ -203,7 +219,7 @@ static void json_encode_array(smart_str
smart_str_appendc(buf, '"');
smart_str_appendc(buf, ':');
json_encode_r(buf, *data TSRMLS_CC);
json_encode_r(buf, *data, options TSRMLS_CC); } }
@@ -227,7 +243,7 @@ static void json_encode_array(smart_str
#define REVERSE16(us) (((us & 0xf) << 12) | (((us >> 4) & 0xf) << 8) |
(((us >> 8) & 0xf) << 4) | ((us >> 12) & 0xf))-static void json_escape_string(smart_str buf, zstr s, int len,
zend_uchar type) / {{{ */
+static void json_escape_string(smart_str buf, zstr s, int len,
zend_uchar type, int options) / {{{ */
{
int pos = 0;
unsigned short us;
@@ -305,6 +321,42 @@ static void json_escape_string(smart_str
smart_str_appendl(buf, "\t", 2);
}
break;
case '<':
{
if (options & PHP_JSON_HEX_TAG) {
smart_str_appendl(buf, "\\u003C", 6);
} else {
smary_str_appendc(buf, '<');
}
}
break;
case '>':
{
if (options & PHP_JSON_HEX_TAG) {
smart_str_appendl(buf, "\\u003E", 6);
} else {
smary_str_appendc(buf, '>');
}
}
break;
case '&':
{
if (options & PHP_JSON_HEX_AMP) {
smart_str_appendl(buf, "\\u0026", 6);
} else {
smary_str_appendc(buf, '&');
}
}
break;
case '\'':
{
if (options & PHP_JSON_HEX_APOS) {
smart_str_appendl(buf, "\\u0027", 6);
} else {
smary_str_appendc(buf, '\'');
}
}
break; default: { if (us >= ' ' && (us & 127) == us)
@@ -337,7 +389,7 @@ static void json_escape_string(smart_str
}
/* }}} */-static void json_encode_r(smart_str *buf, zval val TSRMLS_DC) / {{{ */
+static void json_encode_r(smart_str *buf, zval val, int options
TSRMLS_DC) / {{{ */
{
switch (Z_TYPE_P(val)) {
case IS_NULL:
@@ -374,11 +426,11 @@ static void json_encode_r(smart_str *buf
break;
case IS_STRING:
case IS_UNICODE:
json_escape_string(buf, Z_UNIVAL_P(val), Z_UNILEN_P(val),
Z_TYPE_P(val));
json_escape_string(buf, Z_UNIVAL_P(val), Z_UNILEN_P(val),
Z_TYPE_P(val), options);
break;
case IS_ARRAY:
case IS_OBJECT:
json_encode_array(buf, &val TSRMLS_CC);
json_encode_array(buf, &val, options TSRMLS_CC); break; default: zend_error(E_WARNING, "[json] (json_encode_r) type is unsupported,
encoded as null.");
@@ -396,12 +448,13 @@ PHP_FUNCTION(json_encode)
{
zval *parameter;
smart_str buf = {0};
long options = 0;
if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "z", ¶meter)
== FAILURE) {
if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "z|l",
¶meter, &options) == FAILURE) {
return;
}
json_encode_r(&buf, parameter TSRMLS_CC);
json_encode_r(&buf, parameter, options TSRMLS_CC); /* * Return as binary string, since the result is 99% likely to be just
To that end, the attached patch allows the caller to be paranoid about
their data and stipulate that <>&' should be encoded to hex references
instead. This doesn't stop a web developer from dropping that content
into an innerHTML of course, but it's one more rope holding the ship
together.
Can you explain when it's going to help? I.e. if the concern is that
somebody would stick it in the DOM as-is and have something like XSS
with these data, then encoding it as \u is not enough, as far as I
understand. If it's not the concern, then I'm not sure what are the use
case - when such encoding is necessary?
Stanislav Malyshev, Zend Software Architect
stas@zend.com http://www.zend.com/
(408)253-8829 MSN: stas@zend.com
To that end, the attached patch allows the caller to be paranoid about
their data and stipulate that <>&' should be encoded to hex references
instead. This doesn't stop a web developer from dropping that content
into an innerHTML of course, but it's one more rope holding the ship
together.Can you explain when it's going to help? I.e. if the concern is that
somebody would stick it in the DOM as-is and have something like XSS
with these data, then encoding it as \u is not enough, as far as I
understand. If it's not the concern, then I'm not sure what are the use
case - when such encoding is necessary?
You're absolutely correct that this won't save us from brain-dead
engineers, what it will save us from is broken browsers which
misinterpret otherwise legitimate data and get broken out of their
proper context. (Yes, I've seen browsers do exactly this, and you can
probably guess which versions)
-Sara
Short version, broken browsers have made me bitter and untrusting.
You're absolutely correct that this won't save us from brain-dead
engineers, what it will save us from is broken browsers which
misinterpret otherwise legitimate data and get broken out of their
proper context. (Yes, I've seen browsers do exactly this, and you can
probably guess which versions)
Could you explain in which context? I.e. say I had a broken browser, and
some JSON data having <foo> inside it - in which context that could get
me into trouble?
Stanislav Malyshev, Zend Software Architect
stas@zend.com http://www.zend.com/
(408)253-8829 MSN: stas@zend.com
if we want to
<a onclick="process(<?php echo json_encode($stringValue); ?>)">
we should do
echo htmlspecialchars(json_encode($stringValue)) instead actually, and
yes JSON_HEX_TAG
will help avoiding htmlspecialchars()
just like
urlencode()
ed data which never contains < > or so.
i'm not sure if there is problem if you put json_encode()
ed data in
Stanislav Malyshev wrote:
You're absolutely correct that this won't save us from brain-dead
engineers, what it will save us from is broken browsers which
misinterpret otherwise legitimate data and get broken out of their
proper context. (Yes, I've seen browsers do exactly this, and you can
probably guess which versions)Could you explain in which context? I.e. say I had a broken browser, and
some JSON data having <foo> inside it - in which context that could get
me into trouble?
I can't because I don't know of any successful vectors currently. I
also would have sworn that echoing htmlentified data was safe....until I
came across a browser where it wasn't.
Since I can't show you a known problem that this will absolutely fix,
I'm not suggesting that default behavior be so paranoiac. I'm just
suggesting that users have the option to apply belt-and-suspenders logic
if they feel so inclined.
That said, after yesterday's post I realized that it should be safe
enough to do:
$str = str_replace(array('<','>',"'", '&'),
array('\u003C','\u003E','\u0026','\u0027'), json_encode($data));
That feels terribly hacky though and I'd really rather perform the
escaping inside json_encode()
while acting on the actual value.... just
seems more appropriate to me...
-Sara
I can't because I don't know of any successful vectors currently. I
also would have sworn that echoing htmlentified data was safe....until I
came across a browser where it wasn't.
So that's what I wanted to understand, because if we add this feature,
we should give some explanation on when to use it and what it does, and
I don't think I understand that, so I guess it would help to have such
explanation.
Stanislav Malyshev, Zend Software Architect
stas@zend.com http://www.zend.com/
(408)253-8829 MSN: stas@zend.com
Stanislav Malyshev wrote:
I can't because I don't know of any successful vectors currently. I
also would have sworn that echoing htmlentified data was safe....until
I came across a browser where it wasn't.So that's what I wanted to understand, because if we add this feature,
we should give some explanation on when to use it and what it does, and
I don't think I understand that, so I guess it would help to have such
explanation.
Stuff like this often isn't completely deterministic. The attack
vectors will move around and new ones will be discovered but since the
syntax Sara is proposing is completely valid JSON it gives people
another tool. Documenting specific attack vectors is useful too, of
course, but a secondary concern in my mind.
I don't think we have ever documented some of the vectors against
htmlentities()
, for example. Even with the latest character encoding
fixes, there are still contextual attack vectors where doing
htmlentities()
on user data doesn't help you at all. For the curious,
try this:
<?php $foo = htmlspecialchars($_GET['foo'], ENT_QUOTES);?>
<a href="" onmouseover="a='<?php echo $foo?>';">Mouse Over Me</a>
Then try hitting the page and set ?foo=';alert(0);//
This doesn't mean there is anything wrong with htmlentities()
, of
course, it simply means it was used in the wrong context and another
mechanism is needed here.
I don't think it is hard to imagine that there are times when it would
be nice to be able to move JSON data around in a context in which html
tags and quotes might be inconvenient. Instead of applying a filter on
top of it, having a version of json that doesn't have these is quite useful.
-Rasmus
Is such filtering specific to JSON? Does it have some use out of JSON-context?
Maybe it would be better to provide a set of functions for encoding
characters into '\u'-entities? (similiar to htmlentities,
htmlspecialchars)
because if we speak of 'theoretical' problem, we might end
reimplementing this for some other function later
Stanislav Malyshev wrote:
I can't because I don't know of any successful vectors currently. I
also would have sworn that echoing htmlentified data was safe....until
I came across a browser where it wasn't.So that's what I wanted to understand, because if we add this feature,
we should give some explanation on when to use it and what it does, and
I don't think I understand that, so I guess it would help to have such
explanation.Stuff like this often isn't completely deterministic. The attack
vectors will move around and new ones will be discovered but since the
syntax Sara is proposing is completely valid JSON it gives people
another tool. Documenting specific attack vectors is useful too, of
course, but a secondary concern in my mind.I don't think we have ever documented some of the vectors against
htmlentities()
, for example. Even with the latest character encoding
fixes, there are still contextual attack vectors where doing
htmlentities()
on user data doesn't help you at all. For the curious,
try this:<?php $foo = htmlspecialchars($_GET['foo'], ENT_QUOTES);?>
<a href="" onmouseover="a='<?php echo $foo?>';">Mouse Over Me</a>Then try hitting the page and set ?foo=';alert(0);//
This doesn't mean there is anything wrong with
htmlentities()
, of
course, it simply means it was used in the wrong context and another
mechanism is needed here.I don't think it is hard to imagine that there are times when it would
be nice to be able to move JSON data around in a context in which html
tags and quotes might be inconvenient. Instead of applying a filter on
top of it, having a version of json that doesn't have these is quite useful.-Rasmus
--
--
Alexey Zakhlestin
http://blog.milkfarmsoft.com/
Alexey Zakhlestin wrote:
Is such filtering specific to JSON? Does it have some use out of JSON-context?
Maybe it would be better to provide a set of functions for encoding
characters into '\u'-entities? (similiar to htmlentities,
htmlspecialchars)because if we speak of 'theoretical' problem, we might end
reimplementing this for some other function later
The \u syntax is specific to JSON, yes.
-Rasmus
The \u syntax is specific to JSON, yes.
\u syntax is specific to javascripts string literals, regular
expressions and identifiers[1]
And JSON is not the only way to deliver data into javascript. Manual
approaches are still useful
[1] - http://developer.mozilla.org/en/docs/Core_JavaScript_1.5_Guide:Unicode#Unicode_escape_sequences
Alexey Zakhlestin
http://blog.milkfarmsoft.com/
Alexey Zakhlestin wrote:
The \u syntax is specific to JSON, yes.
\u syntax is specific to javascripts string literals, regular
expressions and identifiers[1]
And JSON is not the only way to deliver data into javascript. Manual
approaches are still useful
Since JSON and Javascript are synonymous, sure, \u is for javascript
string literals. I thought you meant whether it was useful outside of
Javascript.
And I disagree with your second statement. Why wouldn't you use json
anytime you wanted to jump from PHP to Javascript?
That ensures that no matter what $a is on the PHP side, it will be
correctly assigned to the corresponding Javascript variable.
-Rasmus
Stuff like this often isn't completely deterministic. The attack
vectors will move around and new ones will be discovered but since the
syntax Sara is proposing is completely valid JSON it gives people
another tool. Documenting specific attack vectors is useful too, of
course, but a secondary concern in my mind.
I'm not talking about attack vectors and full security analysis. For me,
it is a primary concern having some security oriented feature to know
what exactly it does and when you should and should not use it. We
were burned repeatedly by implementing various cool features that were
misused for doing things that they weren't meant to do and then we were
blamed for it - so I think we need to have clear understanding of when
and why this feature is useful and explicitly document it. Otherwise
what would happen is that people would use this option, pass JS data
through it, stick it into DOM, get XSS and start blogging about "huge
XSS in supposedly secure json_encode()
function in PHP".
Or, not seeing how it can help them, won't use it at all.
Stanislav Malyshev, Zend Software Architect
stas@zend.com http://www.zend.com/
(408)253-8829 MSN: stas@zend.com
Stanislav Malyshev wrote:
Stuff like this often isn't completely deterministic. The attack
vectors will move around and new ones will be discovered but since the
syntax Sara is proposing is completely valid JSON it gives people
another tool. Documenting specific attack vectors is useful too, of
course, but a secondary concern in my mind.I'm not talking about attack vectors and full security analysis. For me,
it is a primary concern having some security oriented feature to know
what exactly it does and when you should and should not use it. We
were burned repeatedly by implementing various cool features that were
misused for doing things that they weren't meant to do and then we were
blamed for it - so I think we need to have clear understanding of when
and why this feature is useful and explicitly document it. Otherwise
what would happen is that people would use this option, pass JS data
through it, stick it into DOM, get XSS and start blogging about "huge
XSS in supposedly securejson_encode()
function in PHP".
Or, not seeing how it can help them, won't use it at all.
This is just a different way of encoding Javascript which depending on
the context of use will enable Javascript to be embedded securely. Not
providing an alternate encoding is a bit like arguing that we shouldn't
have base64_encode()
because if used incorrectly it could be insecure.
We don't have an explanation of when base64_encode()
is useful in the
docs, although I suppose we could come up with some scenarios for when
you want to squeeze arbitrary data into the set of characters
base64_encode()
uses. Same thing for this json_encode()
feature. We
can come up with a set of scenarios where we would like to avoid having
characters that are meaningful in XML and HTML show up in our json
strings.
-Rasmus
This is just a different way of encoding Javascript which depending on
the context of use will enable Javascript to be embedded securely. Not
providing an alternate encoding is a bit like arguing that we shouldn't
havebase64_encode()
because if used incorrectly it could be insecure.
I'm not saying "not providing", I'm saying "we should provide use cases,
otherwise this feature will inevitably be misused".
We don't have an explanation of when
base64_encode()
is useful in the
Because it's established standard that is widely used. json_encode()
option was never used before.
base64_encode()
uses. Same thing for thisjson_encode()
feature. We
can come up with a set of scenarios where we would like to avoid having
characters that are meaningful in XML and HTML show up in our json
strings.
OK, we can. Let's do.
Stanislav Malyshev, Zend Software Architect
stas@zend.com http://www.zend.com/
(408)253-8829 MSN: stas@zend.com
One thing to consider is changing json_encode to add a header
Content-type: application/json (or x-javascript), unless the additional
arguments are used..
That way someone using the function to intermingle with HTML will be
faced with the fact they have to encode the output, otherwise it breaks
the page...
Regards
Alan
Stanislav Malyshev wrote:
This is just a different way of encoding Javascript which depending on
the context of use will enable Javascript to be embedded securely. Not
providing an alternate encoding is a bit like arguing that we shouldn't
havebase64_encode()
because if used incorrectly it could be insecure.I'm not saying "not providing", I'm saying "we should provide use
cases, otherwise this feature will inevitably be misused".We don't have an explanation of when
base64_encode()
is useful in theBecause it's established standard that is widely used.
json_encode()
option was never used before.
base64_encode()
uses. Same thing for thisjson_encode()
feature. We
can come up with a set of scenarios where we would like to avoid having
characters that are meaningful in XML and HTML show up in our json
strings.OK, we can. Let's do.
Alan Knowles wrote:
One thing to consider is changing json_encode to add a header
Content-type: application/json (or x-javascript), unless the additional
arguments are used..
That way someone using the function to intermingle with HTML will be
faced with the fact they have to encode the output, otherwise it breaks
the page...
Now that sounds downright disruptive, and all the tutorials will say:
Use json_encode('foobar', NO_HEADERS) because the other way does weird
stuff to PHP scripts. :-)
--
Edward Z. Yang GnuPG: 0x869C48DA
HTML Purifier http://htmlpurifier.org Anti-XSS Filter
[[ 3FA8 E9A9 7385 B691 A6FC B3CB A933 BE7D 869C 48DA ]]
One thing to consider is changing json_encode to add a header
Content-type: application/json (or x-javascript), unless the additional
arguments are used..
That way someone using the function to intermingle with HTML will be
faced with the fact they have to encode the output, otherwise it breaks
the page...
ob_iconv_handler() does something similar to this and I consider it a
mistake as:
ob_start('ob_iconv_handler');
echo "Foo";
ob_flush()
;
echo "Bar";
Will result in a headers already sent message, even though no explicit
attempt is made to send headers.
I've been meaning to come up with a BC fix for this... In time for 5.3
at least...
-Sara
I think I understand what the use-case is…
If the browser doesn't know anything about <script> tag, it is
supposed to interpret it's contents as html… In which case, html-tags
which appear in javascript code will actually become active ones
The common practice for dealing with this problem is using "comments"…
http://www.webdevout.net/articles/escaping-style-and-script-data
I still can't figure out any other usecase
--
Alexey Zakhlestin
http://blog.milkfarmsoft.com/
I think I understand what the use-case is…
If the browser doesn't know anything about <script> tag, it is
supposed to interpret it's contents as html… In which case, html-tags
which appear in javascript code will actually become active ones
This makes sense. Though wouldn't htmlspecialchars($json, ENT_NOQUOTES)
solve it?
But this use case makes sense, so I suppose if one uses json to output
the data inside HTML (as opposed to sending out in response to AJAX
request for example) one would want to use such option. Thanks for
finding this one!
Stanislav Malyshev, Zend Software Architect
stas@zend.com http://www.zend.com/
(408)253-8829 MSN: stas@zend.com
Alexey Zakhlestin skrev:
I think I understand what the use-case is…
If the browser doesn't know anything about <script> tag, it is
supposed to interpret it's contents as html… In which case, html-tags
which appear in javascript code will actually become active ones
And in the year 2007, soon 2008, what browser would that be?
MSIE 2.0? (JScript was introduced in 3.0)
Netscape 2.0B1? (JavaScript was introduced i B2)
Mosaic?
$10 to the first one who can name a browser that has a market share
above 0.1% and will mistreat the contents in a script tag in this way.
Nah, I'm generous, I'll make that €10!
Lars Gunther
Keryx Web wrote:
Alexey Zakhlestin skrev:
I think I understand what the use-case is…
If the browser doesn't know anything about <script> tag, it is
supposed to interpret it's contents as html… In which case, html-tags
which appear in javascript code will actually become active onesAnd in the year 2007, soon 2008, what browser would that be?
MSIE 2.0? (JScript was introduced in 3.0)
Netscape 2.0B1? (JavaScript was introduced i B2)
Mosaic?
$10 to the first one who can name a browser that has a market share
above 0.1% and will mistreat the contents in a script tag in this way.
Nah, I'm generous, I'll make that €10!
It may not be the browser that misses the <script> tag. It may be the
developer. And I don't mean the trivial case of doing <script><?php
echo json_encode(...)</script>, this is more likely to happen in complex
environments where you have XHR requests returning json and doing DOM
manipulation on it. If you mess up and end up passing the returned json
payload to innerHTML, you are hosed. Using the \u syntax, even if you
mess up and that blob of data finds its way to an innerHTML, nothing
nasty can happen. Basically this is a more robust context-protected way
of encoding json. We should probably have done it this way right from
the beginning, but since we didn't and since we don't really want to
deal with the potential BC issues of changing working code, we have to
add it as an option at this point.
-Rasmus
Rasmus Lerdorf skrev:
It may not be the browser that misses the <script> tag. It may be the
developer.
To make things clear. My intention was not to question the value in
escaping data or even to give an opinion whatsoever about Sara's patch.
I was only replying to Alexey's notion that one should use HTML-comments
together with JavaScript, which is an old and entirely outdated technique.
That means my remark may be considered off topic, but since the issue
had been raised I thought it might be of interest to set things straight.
[Off topic rant]
I am constantly amazed that some PHP-wizards, whose knowledge of
back-end development clearly shows in how many ways I must still be
considered an newbie, are very unaware about what has been going in on
the front end this century, with regards to accessibility, standards,
unobtrusive scripting, semantic (X)HTML, CSS-based design, etc.
[/rant]
Lars Gunther
payload to innerHTML, you are hosed. Using the \u syntax, even if you
mess up and that blob of data finds its way to an innerHTML, nothing
nasty can happen. Basically this is a more robust context-protected way
I'm not sure this is correct - if you just write something like:
<script> var = <?php json_encode($_GET['pleasehackme']) ?>; myDomElement.innerHTML = var.content; </script>you are still in trouble, \u or not. Am I wrong?
Stanislav Malyshev, Zend Software Architect
stas@zend.com http://www.zend.com/
(408)253-8829 MSN: stas@zend.com
payload to innerHTML, you are hosed. Using the \u syntax, even if you
mess up and that blob of data finds its way to an innerHTML, nothing
nasty can happen. Basically this is a more robust context-protected wayI'm not sure this is correct - if you just write something like:
<script> var = <?php json_encode($_GET['pleasehackme']) ?>; myDomElement.innerHTML = var.content; </script>you are still in trouble, \u or not. Am I wrong?
You are correct. But there are two traps here. One is the example
above. Just a bad programmer not taking basic steps to secure data
being dropped into a full HTML context. Nothing PHP can do can protect
the application developer from that.
The second is the one I'm trying to address, wherein data that belongs
in a JS parsing context may (coincidentally) contain HTML parsable data.
For whatever reason, this data may accidently be echoed outside of a
JS context, or a parsing/rendering error may lead to the browser
switching unexpectedly to an HTML context. By outputting \u00XX instead
of <>&', the data remains valid (and syntacticly unmodified) for JS
parsing, but becomes impotent against exploitability in an HTML context.
It's a good potential win, with the only loss being an increase in
payload for those otherwise benign characters. Personally, I'd like to
make this the default behavior, but given the payload increase I'm
offering a compromise of using an option to enable this behavior only
when the user asks for it, thus preserving BC.
-Sara
The second is the one I'm trying to address, wherein data that belongs
in a JS parsing context may (coincidentally) contain HTML parsable data.
For whatever reason, this data may accidently be echoed outside of a
JS context, or a parsing/rendering error may lead to the browser
switching unexpectedly to an HTML context. By outputting \u00XX instead
of <>&', the data remains valid (and syntacticly unmodified) for JS
parsing, but becomes impotent against exploitability in an HTML context.
Yes, I get it now, and with this in mind the feature indeed seems very
useful. Just document it well so people would have clear understanding
what it can and can't do - now that we seem to have good understanding
what it is.
Stanislav Malyshev, Zend Software Architect
stas@zend.com http://www.zend.com/
(408)253-8829 MSN: stas@zend.com
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1
Hi,
Rasmus Lerdorf wrote:
We should probably have done it this way right from
the beginning, but since we didn't and since we don't really want to
deal with the potential BC issues of changing working code, we have to
add it as an option at this point.
As long as the output follows the JSON specs, would there really be any
BC issues anyway? The \u syntax is part and has been part of the specs
since the beginning. Are these real world concerns?
thanks,
-
- Markus
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.6 (MingW32)
Comment: Using GnuPG with Mozilla - http://enigmail.mozdev.org
- Markus
iD8DBQFHVwcM1nS0RcInK9ARApKRAJ0YIPBZYDzF9dGfo2f01kbaDuxtOACg1CYN
TPV8cbTac48uz/xEe1seEtI=
=FOW7
-----END PGP SIGNATURE