Submitting a patch for the Unicode implementation of strtok(), since I don't
have a proper test setup as yet.
-- Rolland
--- Rolland Santimano rollandsantimano@yahoo.com wrote:
Submitting patch for Unicode impl of strtok(), since I don't have a
proper test setup, as yet.
-- Rolland
OK, the attachment got stripped - please check the inline patch below.
Index: ext/standard/basic_functions.h
RCS file: /repository/php-src/ext/standard/basic_functions.h,v
retrieving revision 1.140
diff -u -r1.140 basic_functions.h
--- ext/standard/basic_functions.h 13 Aug 2005 02:23:29 -0000 1.140
+++ ext/standard/basic_functions.h 9 Sep 2005 18:13:25 -0000
@@ -153,9 +153,9 @@
HashTable *user_shutdown_function_names;
HashTable putenv_ht;
zval *strtok_zval;
-
char *strtok_string;
-
void *strtok_string; char *locale_string;
-
char *strtok_last;
-
void *strtok_last; char strtok_table[256]; ulong strtok_len; char str_ebuf[40];
Index: ext/standard/string.c
RCS file: /repository/php-src/ext/standard/string.c,v
retrieving revision 1.481
diff -u -r1.481 string.c
--- ext/standard/string.c 8 Sep 2005 14:07:40 -0000 1.481
+++ ext/standard/string.c 9 Sep 2005 18:13:54 -0000
@@ -1315,88 +1315,168 @@
Tokenize a string */
PHP_FUNCTION(strtok)
{
-
zval **args[2]; -
zval **tok, **str; -
char *token; -
char *token_end; -
char *p; -
char *pe;
-
void *tok, *str; -
int32_t tok_len, str_len; -
zend_uchar tok_type, str_type; -
zval *zv; -
char *token, *token_end, *p, *pe; -
UChar *u_token, *u_p, *u_pe; -
UChar32 ch, th; -
int32_t start, end, i, j, rem_len; -
int delim_found, token_present; int skipped = 0;
-
if (ZEND_NUM_ARGS() < 1 || ZEND_NUM_ARGS() > 2 ||
zend_get_parameters_array_ex(ZEND_NUM_ARGS(), args) == FAILURE) {
-
if (ZEND_NUM_ARGS() < 1 || ZEND_NUM_ARGS() > 2) { WRONG_PARAM_COUNT; }
-
if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "T|T", -
&str, &str_len, &str_type, -
&tok, &tok_len, &tok_type) == FAILURE) { -
return; -
} -
switch (ZEND_NUM_ARGS()) { case 1:
-
tok = args[0];
-
tok = str; -
tok_len = str_len; -
tok_type = str_type; break; default: case 2:
-
str = args[0]; -
tok = args[1]; -
convert_to_string_ex(str); -
zval_add_ref(str); if (BG(strtok_zval)) { zval_ptr_dtor(&BG(strtok_zval)); } -
BG(strtok_zval) = *str; -
BG(strtok_last) = BG(strtok_string) = Z_STRVAL_PP(str); -
BG(strtok_len) = Z_STRLEN_PP(str);
-
MAKE_STD_ZVAL(zv); -
if (str_type == IS_UNICODE) { -
ZVAL_UNICODEL(zv, (UChar *)str, str_len, 1); -
} else if (str_type == IS_BINARY) { -
ZVAL_BINARYL(zv, (char *)str, str_len, 1); -
} else { -
ZVAL_STRINGL(zv, (char *)str, str_len, 1); -
} -
BG(strtok_zval) = zv; -
if (str_type == IS_UNICODE) { -
BG(strtok_last) = BG(strtok_string) = Z_USTRVAL_P(zv); -
} else { -
BG(strtok_last) = BG(strtok_string) = Z_STRVAL_P(zv); -
} -
BG(strtok_len) = str_len; break; }
-
p = BG(strtok_last); /* Where we start to search */ -
pe = BG(strtok_string) + BG(strtok_len); -
if (!p || p >= pe) {
-
if (BG(strtok_zval) && tok_type != Z_TYPE_P(BG(strtok_zval))) { -
php_error_docref(NULL TSRMLS_CC, E_WARNING, "Delimiter type must match string
type.");
RETURN_FALSE;
}
-
convert_to_string_ex(tok); -
token = Z_STRVAL_PP(tok); -
token_end = token + Z_STRLEN_PP(tok);
-
if (tok_type == IS_UNICODE) { -
u_p = (UChar *)BG(strtok_last); /* Where we start to search */ -
u_pe = (UChar *)BG(strtok_string) + BG(strtok_len); -
u_token = (UChar *)tok; -
if (!u_p || u_p >= u_pe) { -
RETURN_FALSE; -
} -
rem_len = u_pe - u_p;
-
while (token < token_end) { -
STRTOK_TABLE(token++) = 1; -
} -
/* Skip leading delimiters */ -
while (STRTOK_TABLE(p)) { -
if (++p >= pe) { -
/* no other chars left */
-
/* Skip leading delimiters */ -
token_present = 0; -
for (i = 0 ; (u_p + i) < u_pe ; ) { -
delim_found = 0; -
U16_NEXT(u_p, i, rem_len, ch); -
for (j = 0 ; j < tok_len ; ) { -
U16_NEXT(u_token, j, tok_len, th); -
if ( ch == th ) { -
delim_found = 1; -
break; -
} -
} -
if (delim_found == 0) { -
U16_BACK_1(u_p, 0, i); /* U16_NEXT() post-incrs 'i' */ -
start = i; -
token_present = 1; -
break; -
} -
} -
if (token_present == 0) { BG(strtok_last) = NULL;
-
RETVAL_FALSE; -
goto restore;
-
RETURN_FALSE; }
-
skipped++; -
} -
/* We know at this place that *p is no delimiter, so skip it */ -
while (++p < pe) { -
if (STRTOK_TABLE(p)) { -
goto return_token;
-
/* Seek to next delimiter */ -
delim_found = 0; -
for (i = start ; (u_p + i) < u_pe ; ) { -
U16_NEXT(u_p, i, rem_len, ch); -
for (j = 0 ; j < tok_len ; ) { -
U16_NEXT(u_token, j, tok_len, th); -
if ( ch == th ) { -
delim_found = 1; -
break; -
} -
} -
if (delim_found) { -
U16_BACK_1(u_p, 0, i); /* 'i' was beyond delimiter */ -
break; -
} -
} -
end = i; -
if (end - start) { -
BG(strtok_last) = u_p + end; -
RETURN_UNICODEL(u_p + start, end - start, 1); -
} else { -
BG(strtok_last) = NULL; -
RETURN_FALSE; }
-
} -
if (p - BG(strtok_last)) {
-return_token:
-
RETVAL_STRINGL(BG(strtok_last) + skipped, (p - BG(strtok_last)) - skipped, 1); -
BG(strtok_last) = p + 1; } else { -
RETVAL_FALSE; -
BG(strtok_last) = NULL; -
}
-
p = (char *)BG(strtok_last); /* Where we start to search */ -
pe = (char *)BG(strtok_string) + BG(strtok_len); -
if (!p || p >= pe) { -
RETURN_FALSE; -
} -
token = (char *)tok; -
token_end = token + tok_len; -
while (token < token_end) { -
STRTOK_TABLE(token++) = 1; -
}
-
/* Restore table -- usually faster then memset'ing the table on every invocation */
-
/* Skip leading delimiters */ -
while (STRTOK_TABLE(p)) { -
if (++p >= pe) { -
/* no other chars left */ -
BG(strtok_last) = NULL; -
RETVAL_FALSE; -
goto restore; -
} -
skipped++; -
} -
/* We know at this place that *p is no delimiter, so skip it */ -
while (++p < pe) { -
if (STRTOK_TABLE(p)) { -
goto return_token; -
} -
} -
if (p - (char *)BG(strtok_last)) {
+return_token:
-
if (tok_type == IS_BINARY) { -
RETVAL_BINARYL((char *)BG(strtok_last) + skipped, (p - (char
*)BG(strtok_last)) - skipped, 1);
-
} else { -
RETVAL_STRINGL((char *)BG(strtok_last) + skipped, (p - (char
*)BG(strtok_last)) - skipped, 1);
-
} -
BG(strtok_last) = p + 1; -
} else { -
RETVAL_FALSE; -
BG(strtok_last) = NULL; -
} -
/* Restore table -- usually faster than memset'ing the table on every invocation */
restore:
-
token = Z_STRVAL_PP(tok); -
while (token < token_end) { -
STRTOK_TABLE(token++) = 0;
-
token = (char *)tok; -
while (token < token_end) { -
STRTOK_TABLE(token++) = 0; -
} }
}
/* }}} */