Newsgroups: php.internals Path: news.php.net Xref: news.php.net php.internals:10391 Return-Path: Mailing-List: contact internals-help@lists.php.net; run by ezmlm Delivered-To: mailing list internals@lists.php.net Received: (qmail 76281 invoked by uid 1010); 10 Jun 2004 18:08:19 -0000 Delivered-To: ezmlm-scan-internals@lists.php.net Delivered-To: ezmlm-internals@lists.php.net Received: (qmail 75314 invoked by uid 1007); 10 Jun 2004 18:08:10 -0000 To: internals@lists.php.net References: Message-ID: Date: Thu, 10 Jun 2004 21:07:49 +0300 Organization: none Content-Type: text/plain; format=flowed; delsp=yes; charset=koi8-r MIME-Version: 1.0 Content-Transfer-Encoding: 8bit User-Agent: Opera M2/7.50 (Win32, build 3778) X-Posted-By: 217.23.116.150 Subject: the new version of strip_tag() From: valyala@tut.by ("Alexander Valyalkin") On Thu, 10 Jun 2004 16:33:13 +0300, Alexander Valyalkin wrote: Today I wrote a new version of strip_tags(). Yes, it is not ideal, but it is much better than the current version. Below is my complete version of strip_tags() with test cases. You can add / change any test cases and compare the speed & results of the current strip_tags() with mine. 
/*
 * Sorry, but I stripped out majority of comments, because they were in russian :)
 * ====================cut====================
 */

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PHPAPI
#define PHP_MAX_HEREDOC_LEN 32
#define PHP_MAX_TAG_LEN 32

/***************************************************/
/* test strings                                    */
/***************************************************/
/*
 * NOTE(review): this table was damaged when the message was archived. Markup
 * inside the string literals was stripped, and an unterminated comment
 * ("incomplete ...") swallowed the beginning of one entry together with the
 * text that followed it. The literals below are exactly the text that
 * survived; restore the original markup if a clean copy is available.
 */
char *s[] = {
    "",             /* empty string */
    "a",            /* one character */
    "<",            /* single < char */
    "ab",           /* two chars */
    "testee');",
    "est",          /* incomplete (the start of this entry was lost) */
    "test",
    "test",
    "test",
    NULL
};
char *allow = "";   /* allowable tags */
/***************************************************/

/*
 * Parser states. The numeric values are part of the interface: callers pass
 * them in and get them back through `stateptr`, so they must not change.
 */
enum {
    ST_TEXT               = 0,  /* plain text, copied to the output        */
    ST_TAG_ATTRS          = 1,  /* inside a tag, after its name            */
    ST_ATTR_DQUOTE        = 2,  /* inside a "..." attribute value          */
    ST_ATTR_SQUOTE        = 3,  /* inside a '...' attribute value          */
    ST_CODE               = 4,  /* inside a <? ... ?> or <% ... %> block   */
    ST_CODE_DQUOTE        = 5,  /* "..." string inside a code block        */
    ST_CODE_SQUOTE        = 6,  /* '...' string inside a code block        */
    ST_CODE_BLOCK_COMMENT = 7,  /* slash-star comment inside a code block  */
    ST_CODE_LINE_COMMENT  = 8,  /* double-slash comment inside a code block*/
    ST_HTML_COMMENT       = 9,  /* inside <!-- ... -->                     */
    ST_TAG_NAME           = 10, /* collecting the tag name                 */
    ST_SCRIPT_BODY        = 11, /* skipping up to "</script"               */
    ST_STYLE_BODY         = 12, /* skipping up to "</style"                */
    ST_TAG_OPEN           = 13, /* '<' just seen                           */
    ST_TAG_CLOSE          = 14, /* tag just ended; next byte is text       */
    ST_HEREDOC_START      = 15, /* "<<<" just seen inside a code block     */
    ST_HEREDOC_BODY       = 16, /* heredoc terminator not found yet        */
    ST_CODE_BACKTICK      = 17, /* `...` string inside a code block        */
    ST_CODE_HASH_COMMENT  = 18  /* '#' comment inside a code block         */
};

/* Bounded substring search: first occurrence of needle[0..needle_len) inside
 * hay[0..hay_len), or NULL. An empty needle matches at `hay`. */
static char *find_bytes(char *hay, size_t hay_len, const char *needle, size_t needle_len)
{
    char *cursor = hay;

    if (needle_len == 0) {
        return hay;
    }
    while (hay_len >= needle_len) {
        /* Only positions that leave room for the whole needle are candidates. */
        char *hit = memchr(cursor, needle[0], hay_len - needle_len + 1);

        if (hit == NULL) {
            return NULL;
        }
        if (memcmp(hit, needle, needle_len) == 0) {
            return hit;
        }
        hay_len -= (size_t) (hit - cursor) + 1;
        cursor = hit + 1;
    }
    return NULL;
}

/* Finds "</name" (name compared case-insensitively, `name` given in lower
 * case) in [p, end). Returns a pointer to the '/' on success, `end` otherwise. */
static char *find_closing_tag(char *p, char *end, const char *name, size_t name_len)
{
    while (p < end) {
        size_t i;

        p = memchr(p, '<', (size_t) (end - p));
        if (p == NULL || (size_t) (end - p) < name_len + 2) {
            return end;
        }
        p++;
        if (*p != '/') {
            continue;
        }
        for (i = 0; i < name_len; i++) {
            if (tolower((unsigned char) p[1 + i]) != name[i]) {
                break;
            }
        }
        if (i == name_len) {
            return p;
        }
    }
    return end;
}

/* Looks for the stored heredoc label in [*cursor, end). `span` is the label
 * length plus one (the stored NUL). On success moves *cursor `span` bytes past
 * the start of the match (clamped to `end`) and returns 1; otherwise sets
 * *cursor to `end` and returns 0.
 * NOTE(review): skipping span (label + 1) bytes reproduces the original
 * arithmetic; whether the extra byte is intentional is not clear - confirm. */
static int seek_heredoc_end(char **cursor, char *end, const char *name, size_t span)
{
    size_t name_len = span > 0 ? span - 1 : 0;
    char *hit = find_bytes(*cursor, (size_t) (end - *cursor), name, name_len);
    size_t skip = span;

    if (hit == NULL) {
        *cursor = end;
        return 0;
    }
    if (skip > (size_t) (end - hit)) {
        skip = (size_t) (end - hit);
    }
    *cursor = hit + skip;
    return 1;
}

/*
 * Re-emits a just-parsed tag into the output if it appears in the allow list.
 *
 *   allow, allow_len  NUL-terminated list of allowed tags written as
 *                     "<name>" items, and its length.
 *   tag_name_begin    start of the collected (lower-cased) tag name; a
 *                     leading '/' marks an end tag.
 *   tag_name_ptr      one past the last collected name byte; a NUL is stored
 *                     there.
 *   src_ptr           position of the closing '>' in the buffer being
 *                     filtered in place.
 *   dst_ptr           in/out write cursor into the same buffer.
 *
 * Only the name is kept: the tag is written as '<' name [' '] ['/'] '>',
 * where the optional space and slash mirror the two bytes preceding '>'.
 * Nothing is written when the tag is not allowed or would not fit in the
 * already-consumed part of the buffer.
 */
void php_tag_find(char *allow, size_t allow_len, char *tag_name_begin, char *tag_name_ptr, char *src_ptr, char **dst_ptr)
{
    size_t tag_len = (size_t) (tag_name_ptr - tag_name_begin);
    size_t is_end_tag = (tag_len > 1 && *tag_name_begin == '/') ? 1 : 0;
    const char *needle = tag_name_begin + is_end_tag;
    size_t needle_len = tag_len - is_end_tag;
    const char *hit;
    size_t add_space;
    size_t add_slash;
    int allowed = 0;

    *tag_name_ptr = '\0';

    if (allow == NULL || allow_len < 3 || tag_len < 1) {
        return;
    }
    if (src_ptr <= *dst_ptr || (size_t) (src_ptr - *dst_ptr) <= tag_len) {
        return;
    }
    /* An empty needle (NUL byte in the tag name) would match everywhere. */
    if (*needle == '\0') {
        return;
    }

    /* Try every occurrence: the first hit for "b" in "<ab><b>" is inside
     * "ab" and must not end the search. */
    for (hit = strstr(allow + 1, needle); hit != NULL; hit = strstr(hit + 1, needle)) {
        size_t after = (size_t) (hit - allow) + needle_len;

        if (after < allow_len && allow[after] == '>' && hit[-1] == '<') {
            allowed = 1;
            break;
        }
    }
    if (!allowed) {
        return;
    }

    add_space = (src_ptr[-2] == ' ') ? 1 : 0;
    add_slash = (src_ptr[-1] == '/') ? 1 : 0;

    /* The output must fit into the bytes already consumed (up to and
     * including the '>' at src_ptr), otherwise unread input is clobbered. */
    if (tag_len + 2 + add_space + add_slash > (size_t) (src_ptr - *dst_ptr) + 1) {
        return;
    }

    *(*dst_ptr)++ = '<';
    memcpy(*dst_ptr, tag_name_begin, tag_len);
    *dst_ptr += tag_len;
    if (add_space) {
        *(*dst_ptr)++ = ' ';
    }
    if (add_slash) {
        *(*dst_ptr)++ = '/';
    }
    *(*dst_ptr)++ = '>';
}

/*
 * Strips markup from rbuf[0..len) in place and NUL-terminates the result.
 *
 *   rbuf       buffer to filter; must have room for len + 1 bytes because the
 *              terminating NUL may be stored at rbuf[len].
 *   len        number of input bytes (negative values are treated as 0).
 *   stateptr   optional; initial parser state on entry, final state on exit,
 *              so input can be fed in several chunks. NULL means start in
 *              state 0 and discard the final state.
 *   allow      NUL-terminated list of tags to keep ("<b><i>"), see
 *              php_tag_find().
 *   allow_len  length of `allow`.
 *
 * Returns the length of the filtered text.
 *
 * The tag-name and heredoc-label buffers are function-level statics, so the
 * function is not reentrant. Look-behind checks never read before `rbuf`; a
 * construct split exactly across two chunks may therefore not be recognised.
 */
PHPAPI size_t php_strip_all_tags(char *rbuf, int len, int *stateptr, char *allow, int allow_len)
{
    static char tag_name_begin[PHP_MAX_TAG_LEN + 1];
    static char *tag_name_ptr = NULL;
    static char heredoc_name_begin[PHP_MAX_HEREDOC_LEN + 1];
    static char *heredoc_name_ptr = NULL;

    char *const tag_name_end = tag_name_begin + PHP_MAX_TAG_LEN;
    char *const heredoc_name_end = heredoc_name_begin + PHP_MAX_HEREDOC_LEN;
    char *src_begin = rbuf;
    char *src_ptr = rbuf;
    char *src_end;
    char *dst_ptr = rbuf;
    size_t allow_size = allow_len > 0 ? (size_t) allow_len : 0;
    int state = (stateptr != NULL) ? *stateptr : ST_TEXT;

    if (rbuf == NULL) {
        return 0;
    }
    if (len < 0) {
        len = 0;
    }
    src_end = rbuf + (size_t) len;

    if (tag_name_ptr == NULL) {
        tag_name_ptr = tag_name_begin;
    }
    if (heredoc_name_ptr == NULL) {
        heredoc_name_ptr = heredoc_name_begin;
    }

    while (src_ptr < src_end) {
        const char ch = *src_ptr;
        const int at_start = (src_ptr == src_begin);
        /* Previous input byte; '\0' when there is none in this chunk. */
        const char prev = at_start ? '\0' : src_ptr[-1];

        /* Step 1: state transitions triggered by the current byte. */
        switch (ch) {
        case '#':
            if (state == ST_CODE) {
                state = ST_CODE_HASH_COMMENT;
            }
            break;

        case '-':
            /* "<!--" opens an HTML comment. */
            if (state == ST_TAG_NAME && (src_ptr - src_begin) > 2
                && src_ptr[-1] == '-' && src_ptr[-2] == '!' && src_ptr[-3] == '<') {
                state = ST_HTML_COMMENT;
            }
            break;

        case '\r':
        case '\n':
            switch (state) {
            case ST_CODE_LINE_COMMENT:
            case ST_CODE_HASH_COMMENT:
                state = ST_CODE;
                break;
            case ST_TAG_NAME:
                state = ST_TAG_ATTRS;
                break;
            default:
                break;
            }
            break;

        case ' ':
        case '\t':
        case '\v':
        case '\f':
            if (state == ST_TAG_NAME) {
                state = ST_TAG_ATTRS;
            }
            break;

        case '*':
            if (state == ST_CODE && prev == '/') {
                state = ST_CODE_BLOCK_COMMENT;
            }
            break;

        case '/':
            switch (state) {
            case ST_CODE:
                if (prev == '/') {
                    state = ST_CODE_LINE_COMMENT;
                }
                break;
            case ST_TAG_NAME:
                /* A '/' right after '<' belongs to the name (end tag). */
                if (prev != '<') {
                    state = ST_TAG_ATTRS;
                }
                break;
            default:
                break;
            }
            break;

        case '\\':
            switch (state) {
            case ST_CODE_DQUOTE:
            case ST_CODE_SQUOTE:
            case ST_CODE_BACKTICK:
                /* Skip the escaped byte so it cannot close the string. */
                if (src_end - src_ptr > 1) {
                    src_ptr++;
                }
                break;
            default:
                break;
            }
            break;

        case '%':
        case '?':
            /* "<?" / "<%" open a code block, unless followed by 'x' ("<?xml"). */
            if (state == ST_TAG_NAME && prev == '<' && src_end - src_ptr > 1
                && tolower((unsigned char) src_ptr[1]) != 'x') {
                if (tag_name_ptr < tag_name_end) {
                    *tag_name_ptr++ = ch;
                }
                state = ST_CODE;
            }
            break;

        case '<':
            switch (state) {
            case ST_TEXT:
                if (src_end - src_ptr > 1 && !isspace((unsigned char) src_ptr[1])) {
                    state = ST_TAG_OPEN;
                }
                break;
            case ST_CODE:
                if (src_end - src_ptr > 2 && src_ptr[1] == '<' && src_ptr[2] == '<') {
                    state = ST_HEREDOC_START;
                    src_ptr += 2;
                }
                break;
            default:
                break;
            }
            break;

        case '>':
            switch (state) {
            case ST_CODE:
            case ST_CODE_LINE_COMMENT:
            case ST_CODE_HASH_COMMENT:
                /* "?>" or "%>": the opener byte is kept in tag_name_begin[0]. */
                if (!at_start && prev == *tag_name_begin) {
                    state = ST_TAG_CLOSE;
                }
                break;
            case ST_TAG_ATTRS:
            case ST_TAG_NAME:
                if (tag_name_ptr - tag_name_begin == 6 && !memcmp(tag_name_begin, "script", 6)) {
                    state = ST_SCRIPT_BODY;
                } else if (tag_name_ptr - tag_name_begin == 5 && !memcmp(tag_name_begin, "style", 5)) {
                    state = ST_STYLE_BODY;
                } else {
                    php_tag_find(allow, allow_size, tag_name_begin, tag_name_ptr, src_ptr, &dst_ptr);
                    state = ST_TAG_CLOSE;
                }
                break;
            default:
                break;
            }
            break;

        case '"':
            switch (state) {
            case ST_TAG_ATTRS:
                if (prev == '=' || isspace((unsigned char) prev)) {
                    state = ST_ATTR_DQUOTE;
                }
                break;
            case ST_CODE:
                state = ST_CODE_DQUOTE;
                break;
            case ST_CODE_DQUOTE:
                state = ST_CODE;
                break;
            default:
                break;
            }
            break;

        case '\'':
            switch (state) {
            case ST_TAG_ATTRS:
                if (prev == '=' || isspace((unsigned char) prev)) {
                    state = ST_ATTR_SQUOTE;
                }
                break;
            case ST_CODE:
                state = ST_CODE_SQUOTE;
                break;
            case ST_CODE_SQUOTE:
                state = ST_CODE;
                break;
            default:
                break;
            }
            break;

        case '`':
            switch (state) {
            case ST_CODE:
                state = ST_CODE_BACKTICK;
                break;
            case ST_CODE_BACKTICK:
                state = ST_CODE;
                break;
            default:
                break;
            }
            break;

        default:
            break;
        }

        /* Step 2: act on the (possibly updated) state. */
        switch (state) {
        case ST_TEXT:
            *dst_ptr++ = ch;
            break;

        case ST_ATTR_DQUOTE:
        case ST_ATTR_SQUOTE: {
            /* Jump straight to the closing quote of the attribute value. */
            const int quote = (state == ST_ATTR_DQUOTE) ? '"' : '\'';

            src_ptr++;
            src_ptr = memchr(src_ptr, quote, (size_t) (src_end - src_ptr));
            if (src_ptr == NULL) {
                src_ptr = src_end;
            } else {
                state = ST_TAG_ATTRS;
            }
            break;
        }

        case ST_CODE_BLOCK_COMMENT:
            /* Skip to the '/' of the closing star-slash. */
            src_ptr++;
            while (src_ptr < src_end) {
                src_ptr = memchr(src_ptr, '*', (size_t) (src_end - src_ptr));
                if (src_ptr == NULL || src_end - src_ptr < 2) {
                    src_ptr = src_end;
                } else {
                    src_ptr++;
                    if (*src_ptr == '/') {
                        break;
                    }
                }
            }
            if (src_ptr < src_end) {
                state = ST_CODE;
            }
            break;

        case ST_HTML_COMMENT:
            /* Skip to the '>' of the closing "-->". */
            src_ptr++;
            while (src_ptr < src_end) {
                src_ptr = memchr(src_ptr, '-', (size_t) (src_end - src_ptr));
                if (src_ptr == NULL || src_end - src_ptr < 3) {
                    src_ptr = src_end;
                } else {
                    src_ptr++;
                    if (src_ptr[0] == '-' && src_ptr[1] == '>') {
                        break;
                    }
                }
            }
            if (src_ptr < src_end) {
                src_ptr++;
                state = ST_TEXT;
            }
            break;

        case ST_TAG_NAME:
            if (tag_name_ptr < tag_name_end) {
                *tag_name_ptr++ = (char) tolower((unsigned char) ch);
            }
            break;

        case ST_SCRIPT_BODY:
        case ST_STYLE_BODY: {
            const char *name = (state == ST_SCRIPT_BODY) ? "script" : "style";
            const size_t name_len = (state == ST_SCRIPT_BODY) ? 6 : 5;

            src_ptr = find_closing_tag(src_ptr + 1, src_end, name, name_len);
            if (src_ptr < src_end) {
                /* Land on the last letter of the closing tag's name. */
                src_ptr += name_len;
                /* Pin the name length at the maximum so that the upcoming
                 * '>' is not taken for another <script>/<style> opener. */
                tag_name_ptr = tag_name_end;
                state = ST_TAG_ATTRS;
            }
            break;
        }

        case ST_TAG_OPEN:
            tag_name_ptr = tag_name_begin;
            state = ST_TAG_NAME;
            break;

        case ST_TAG_CLOSE:
            state = ST_TEXT;
            break;

        case ST_HEREDOC_START:
            src_ptr++;
            heredoc_name_ptr = heredoc_name_begin;
            while (src_ptr < src_end && (*src_ptr == ' ' || *src_ptr == '\t')) {
                src_ptr++;
            }
            if (src_ptr < src_end) {
                while (src_ptr < src_end && heredoc_name_ptr < heredoc_name_end
                       && isalnum((unsigned char) *src_ptr)) {
                    *heredoc_name_ptr++ = *src_ptr++;
                }
                /* Require a freshly read label that starts with a letter; an
                 * empty label must not be judged by a stale buffer byte. */
                if (src_ptr < src_end && heredoc_name_ptr > heredoc_name_begin
                    && isalpha((unsigned char) *heredoc_name_begin)) {
                    *heredoc_name_ptr++ = '\0';
                    state = seek_heredoc_end(&src_ptr, src_end, heredoc_name_begin,
                                             (size_t) (heredoc_name_ptr - heredoc_name_begin))
                            ? ST_CODE : ST_HEREDOC_BODY;
                } else {
                    state = ST_CODE;
                }
            }
            break;

        case ST_HEREDOC_BODY:
            if (seek_heredoc_end(&src_ptr, src_end, heredoc_name_begin,
                                 (size_t) (heredoc_name_ptr - heredoc_name_begin))) {
                state = ST_CODE;
            }
            break;

        default:
            break;
        }

        src_ptr++;
    }

    *dst_ptr = '\0';
    if (stateptr != NULL) {
        *stateptr = state;
    }
    return (size_t) (dst_ptr - src_begin);
}

/***************************************************/
/* Runs php_strip_all_tags() over every entry of s[] and prints the result. */
int main(int argc, char *argv[])
{
    size_t allow_len = strlen(allow);
    size_t capacity = 1;
    char *buf = malloc(capacity);
    int i;

    (void) argc;
    (void) argv;

    if (buf == NULL) {
        fputs("out of memory\n", stderr);
        return EXIT_FAILURE;
    }
    *buf = '\0';

    for (i = 0; s[i] != NULL; i++) {
        size_t src_len = strlen(s[i]);
        size_t dst_len;
        int state = 0;  /* every string starts in the plain-text state */

        printf("str_num=%d, ", i);

        if (src_len + 1 > capacity) {
            /* Keep the old pointer until realloc is known to have succeeded. */
            char *grown = realloc(buf, src_len + 1);

            if (grown == NULL) {
                free(buf);
                fputs("out of memory\n", stderr);
                return EXIT_FAILURE;
            }
            buf = grown;
            capacity = src_len + 1;
        }
        memcpy(buf, s[i], src_len + 1);
        /* printf("src=[%s], ", buf); */

        dst_len = php_strip_all_tags(buf, (int) src_len, &state, allow, (int) allow_len);
        printf("dst=[%s], src_len=%zu, dst_len=%zu, state=%d\n", buf, src_len, dst_len, state);
    }

    free(buf);
    return 0;
}

/*
 * ====================cut====================
 * --
 * Using Opera's revolutionary e-mail client: http://www.opera.com/m2/
 */