Newsgroups: php.internals Path: news.php.net Xref: news.php.net php.internals:10400 Return-Path: Mailing-List: contact internals-help@lists.php.net; run by ezmlm Delivered-To: mailing list internals@lists.php.net Received: (qmail 47655 invoked by uid 1010); 11 Jun 2004 12:39:46 -0000 Delivered-To: ezmlm-scan-internals@lists.php.net Delivered-To: ezmlm-internals@lists.php.net Received: (qmail 46727 invoked by uid 1007); 11 Jun 2004 12:39:36 -0000 To: internals@lists.php.net Date: Fri, 11 Jun 2004 15:37:32 +0300 References: Organization: none Content-Type: text/plain; format=flowed; delsp=yes; charset=koi8-r MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Message-ID: User-Agent: Opera M2/7.50 (Win32, build 3778) X-Posted-By: 217.23.116.150 Subject: my strip_tags() v2 From: valyala@tut.by ("Alexander Valyalkin") Some bugs have been fixed in this version: 1) All [#include] directives have been moved to the top. Just copy and compile the sources to test it :) 2) Renamed php_strip_all_tags() to php_strip_tags() with the same interface as in the current version. 3) Fixed php_tag_find(). Allowable tags are now case- and order-insensitive. 4) Added new test strings. Any comments and suggestions are welcome. ==================cut===================
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
/***************************************************/ /* test strings */ /***************************************************/ char *s[] = { "", /* empty string */ "a", /* one character */ "<", /* single < char */ "", /* single tag */ "ab", /* two chars */ "testee');", /* incomplete est", "test", "test", "test", "test

a

b", NULL }; char *allow = "
,trash
"; /* allowable tags */ /***************************************************/ #define PHPAPI #define PHP_MAX_HEREDOC_LEN 1 #define PHP_MAX_TAG_LEN 32 PHPAPI char *php_strtolower(char *s, size_t len) { unsigned char *c, *e; c = s; e = c+len; while (c < e) { *c = tolower(*c); c++; } return s; } /* {{{ php_tag_find Copies tag [tag_name_begin] with length [tag_name_ptr - tag_name_begin] to [dst_ptr] if it is in a set of allowable tags, pointed by [allow] with length [allow_len] */ void php_tag_find(char *allow, size_t allow_len, char *tag_name_begin, char *tag_name_ptr, char *src_ptr, char **dst_ptr) { size_t tag_len, pure_tag_len; char *tmp_ptr; int is_end_tag = 0; tag_len = tag_name_ptr - tag_name_begin; if (allow_len < 3 || src_ptr - *dst_ptr <= tag_len || tag_len < 1) return; if (*tag_name_begin == '/' && tag_len > 1) is_end_tag = 1; pure_tag_len = is_end_tag ? (tag_len - 1) : tag_len; static char tag_name[PHP_MAX_TAG_LEN + 3]; tag_name[0] = '<'; memcpy(tag_name + 1, is_end_tag ? (tag_name_begin + 1) : tag_name_begin, pure_tag_len); pure_tag_len++; tag_name[pure_tag_len++] = '>'; tag_name[pure_tag_len] = '\0'; tmp_ptr = strstr(allow, tag_name); if (tmp_ptr != NULL) { *(*dst_ptr)++ = '<'; memcpy(*dst_ptr, tag_name_begin, tag_len); *dst_ptr += tag_len; if (*(src_ptr - 2) == ' ') *(*dst_ptr)++ = ' '; if (*(src_ptr - 1) == '/') *(*dst_ptr)++ = '/'; *(*dst_ptr)++ = '>'; } } /* }}} */ PHPAPI size_t php_strip_tags(char *rbuf, int len, int *stateptr, char *allow, int allow_len) { char *src_begin = rbuf, *src_ptr = rbuf, *src_end = rbuf + (size_t) len; char *dst_ptr = rbuf; int state; if (stateptr != NULL) state = *stateptr; else state = 0; static char tag_name_begin[PHP_MAX_TAG_LEN + 1], *tag_name_ptr = NULL, *tag_name_end = NULL; size_t tag_len = 0; if (tag_name_ptr == NULL) tag_name_ptr = tag_name_begin; if (tag_name_end == NULL) tag_name_end = tag_name_begin + PHP_MAX_TAG_LEN; static char heredoc_name_begin[PHP_MAX_HEREDOC_LEN + 1], *heredoc_name_ptr = NULL, 
*heredoc_name_end = NULL; if (heredoc_name_ptr == NULL) heredoc_name_ptr = heredoc_name_begin; if (heredoc_name_end == NULL) heredoc_name_end = heredoc_name_begin + PHP_MAX_HEREDOC_LEN; char ch; php_strtolower(allow, allow_len); while (src_ptr < src_end) { ch = *src_ptr; switch (ch) { case '#' : switch (state) { case 4 : state = 18; break; } break; case '-' : switch (state) { case 10 : if ((src_ptr - src_begin) > 2 && *(src_ptr - 1) == '-' && *(src_ptr - 2) == '!' && *(src_ptr - 3) == '<') state = 9; break; } break; case '\r' : case '\n' : switch (state) { case 8 : case 18 : state = 4; break; case 10 : state = 1; break; } break; case ' ' : case '\t' : case '\v' : case '\f' : switch (state) { case 10 : state = 1; break; } break; case '*' : switch (state) { case 4 : if (*(src_ptr - 1) == '/') state = 7; break; } break; case '/' : switch (state) { case 4 : if (*(src_ptr - 1) == '/') state = 8; break; case 10 : if (*(src_ptr - 1) != '<') state = 1; break; } break; case '\\' : switch (state) { case 5 : case 6 : case 17 : if (src_ptr < src_end) src_ptr++; break; } break; case '%' : case '?' 
: switch (state) { case 10 : if (*(src_ptr - 1) == '<' && src_ptr + 1 < src_end && tolower(*(src_ptr + 1)) != 'x') { if (tag_name_ptr < tag_name_end) *tag_name_ptr++ = ch; state = 4; } break; } break; case '<' : switch (state) { case 0 : if (src_end - src_ptr > 1 && !isspace(*(src_ptr + 1))) state = 13; break; case 4 : if ((src_ptr + 2 < src_end) && *(src_ptr + 1) == '<' && *(src_ptr + 2) == '<') { state = 15; src_ptr += 2; } break; } break; case '>' : switch (state) { case 4 : case 8 : case 18 : if (*(src_ptr - 1) == *tag_name_begin) state = 14; break; case 1 : case 10 : if (tag_name_ptr - tag_name_begin == 6 && !memcmp(tag_name_begin, "script", 6)) state = 11; else if (tag_name_ptr - tag_name_begin == 5 && !memcmp(tag_name_begin, "style", 5)) state = 12; else { php_tag_find(allow, (size_t) allow_len, tag_name_begin, tag_name_ptr, src_ptr, &dst_ptr); state = 14; } break; } break; case '"' : switch (state) { case 1 : if (*(src_ptr - 1) == '=' || isspace(*(src_ptr - 1))) state = 2; break; case 4 : state = 5; break; case 5 : state = 4; break; } break; case '\'' : switch (state) { case 1 : if (*(src_ptr - 1) == '=' || isspace(*(src_ptr - 1))) state = 3; break; case 4 : state = 6; break; case 6 : state = 4; break; } break; case '`' : switch (state) { case 4 : state = 17; break; case 17 : state = 4; break; } break; } switch (state) { case 0 : *dst_ptr++ = ch; break; case 2 : src_ptr++; src_ptr = memchr(src_ptr, '"', src_end - src_ptr); if (src_ptr == NULL) src_ptr = src_end; else state = 1; break; case 3 : src_ptr++; src_ptr = memchr(src_ptr, '\'', src_end - src_ptr); if (src_ptr == NULL) src_ptr = src_end; else state = 1; break; case 7 : src_ptr++; while (src_ptr < src_end) { src_ptr = memchr(src_ptr, '*', src_end - src_ptr); if (src_ptr == NULL || src_end - src_ptr < 2) src_ptr = src_end; else { src_ptr++; if (*src_ptr == '/') break; } } if (src_ptr < src_end) state = 4; break; case 9 : src_ptr++; while (src_ptr < src_end) { src_ptr = memchr(src_ptr, '-', src_end - 
src_ptr); if (src_ptr == NULL || src_end - src_ptr < 3) src_ptr = src_end; else { src_ptr++; if (*src_ptr == '-' && *(src_ptr + 1) == '>') break; } } if (src_ptr < src_end) { src_ptr++; state = 0; } break; case 10 : if (tag_name_ptr < tag_name_end) *tag_name_ptr++ = tolower(ch); break; case 11 : src_ptr++; while (src_ptr < src_end) { src_ptr = memchr(src_ptr, '<', src_end - src_ptr); if (src_ptr == NULL || src_end - src_ptr < 8) src_ptr = src_end; else { src_ptr++; if (src_ptr[0] == '/' && tolower(src_ptr[1]) == 's' && tolower(src_ptr[2]) == 'c' && tolower(src_ptr[3]) == 'r' && tolower(src_ptr[4]) == 'i' && tolower(src_ptr[5]) == 'p' && tolower(src_ptr[6]) == 't') break; } } if (src_ptr < src_end) { src_ptr += 6; tag_name_ptr = tag_name_begin; state = 1; } break; case 12 : src_ptr++; while (src_ptr < src_end) { src_ptr = memchr(src_ptr, '<', src_end - src_ptr); if (src_ptr == NULL || src_end - src_ptr < 7) src_ptr = src_end; else { src_ptr++; if (src_ptr[0] == '/' && tolower(src_ptr[1]) == 's' && tolower(src_ptr[2]) == 't' && tolower(src_ptr[3]) == 'y' && tolower(src_ptr[4]) == 'l' && tolower(src_ptr[5]) == 'e') break; } } if (src_ptr < src_end) { src_ptr += 5; tag_name_ptr = tag_name_begin; state = 1; } break; case 13 : tag_name_ptr = tag_name_begin; state = 10; break; case 14 : state = 0; break; case 15 : src_ptr++; heredoc_name_ptr = heredoc_name_begin; while (src_ptr < src_end && (*src_ptr == ' ' || *src_ptr == '\t')) src_ptr++; if (src_ptr < src_end) { while (src_ptr < src_end && heredoc_name_ptr < heredoc_name_end && isalnum(*src_ptr)) *heredoc_name_ptr++ = *src_ptr++; if (src_ptr < src_end && isalpha(*heredoc_name_begin)) { *heredoc_name_ptr++ = '\0'; src_ptr = strstr(src_ptr, heredoc_name_begin); if (src_ptr == NULL) { src_ptr = src_end; state = 16; } else { src_ptr += heredoc_name_ptr - heredoc_name_begin; state = 4; } } else state = 4; } break; case 16 : src_ptr = strstr(src_ptr, heredoc_name_begin); if (src_ptr == NULL) src_ptr = src_end; else { src_ptr 
+= heredoc_name_ptr - heredoc_name_begin; state = 4; } break; } src_ptr++; } *dst_ptr = '\0'; if (stateptr != NULL) *stateptr = state; return (size_t) (dst_ptr - src_begin); } /***************************************************/ int main(int argc,char *argv[]) { int i = 0; char *s1, *allow1; size_t len_old, len_new, allow_len; int state; allow_len = strlen(allow); allow1 = (char *) malloc(allow_len + 1); memcpy(allow1, allow, allow_len + 1); s1 = (char *) malloc(1); len_old = 0; *s1 = '\0'; while (s[i] != NULL) { printf("str_num=%d, ", i); state = 0; /* set state to 0 */ len_new = strlen(s[i]); if (len_new > len_old) s1 = (char *) realloc(s1, len_new + 1); strcpy(s1, s[i]); // printf("src=[%s], ", s1); len_old = php_strip_tags(s1, len_new, &state, allow1, allow_len); printf("dst=[%s], src_len=%d, dst_len=%d, state=%d\n", s1, len_new, len_old, state); len_old = len_new; i++; } free(s1); free(allow1); return 0; } ==================cut===================