Newsgroups: php.internals Path: news.php.net Xref: news.php.net php.internals:128 Return-Path: Mailing-List: contact internals-help@lists.php.net; run by ezmlm Delivered-To: mailing list internals@lists.php.net Received: (qmail 37682 invoked from network); 21 Mar 2003 10:41:02 -0000 Received: from unknown (HELO pa.ktts.kharkov.ua) (193.124.76.197) by pb1.pair.com with SMTP; 21 Mar 2003 10:41:02 -0000 Received: from 7-10.fake.pa.net (7-10.fake.pa.net [192.168.23.80]) by pa.ktts.kharkov.ua (8.11.6/8.11.6) with SMTP id h2LAeJm10108 for ; Fri, 21 Mar 2003 12:40:22 +0200 Date: Fri, 21 Mar 2003 12:41:05 +0200 To: internals@lists.php.net Message-ID: <20030321124105.6a71c972.tony2001@phpclub.net> X-Mailer: Sylpheed version 0.8.11 (GTK+ 1.2.10; i686-pc-linux-gnu) Mime-Version: 1.0 Content-Type: multipart/mixed; boundary="Multipart_Fri__21_Mar_2003_12:41:05_+0200_08598ce0" Subject: suggested patch for ext/standard/html.c for correct htmlentity'ing of cyrillic characters From: tony2001@phpclub.net (Antony Dovgal) --Multipart_Fri__21_Mar_2003_12:41:05_+0200_08598ce0 Content-Type: text/plain; charset=US-ASCII Content-Transfer-Encoding: 7bit Hello, all. The attached patch adds support for Cyrillic character sets to htmlentities(), useful for those who want to see correct Cyrillic letters after using this function (at the moment htmlentities() breaks Cyrillic characters). KOI8-R, Windows-1251, ISO-8859-5 & CP866 encodings are supported. I've tested this patch with PHP-CVS & PHP 4.3.2RC1 - it seems to be working correctly. Could someone with enough karma please take a look at it and apply it if it's OK? -- Wbr, Antony Dovgal aka tony2001 mailto:tony2001@phpclub.net http://phpclub.net --- Stand for something or you will fall for nothing. 
--Multipart_Fri__21_Mar_2003_12:41:05_+0200_08598ce0 Content-Type: text/plain; name="diff.txt" Content-Disposition: attachment; filename="diff.txt" Content-Transfer-Encoding: 7bit --- html.c Thu Mar 20 09:51:08 2003 +++ /root/CVS/php5/ext/standard/html.c Fri Mar 21 00:36:10 2003 @@ -46,7 +46,9 @@ enum entity_charset { cs_terminator, cs_8859_1, cs_cp1252, cs_8859_15, cs_utf_8, cs_big5, cs_gb2312, - cs_big5hkscs, cs_sjis, cs_eucjp}; + cs_big5hkscs, cs_sjis, cs_eucjp, cs_koi8r, + cs_cp1251, cs_8859_5, cs_cp866 + }; typedef const char *entity_table_t; /* codepage 1252 is a Windows extension to iso-8859-1. */ @@ -255,6 +257,64 @@ "spades", NULL, NULL, "clubs", NULL, "hearts", "diams" }; +static entity_table_t ent_koi8r[] = { + "#1105", /* "jo "*/ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, "#1025", /* "JO" */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + "#1102", "#1072", "#1073", "#1094", "#1076", "#1077", "#1092", + "#1075", "#1093", "#1080", "#1081", "#1082", "#1083", "#1084", + "#1085", "#1086", "#1087", "#1103", "#1088", "#1089", "#1090", + "#1091", "#1078", "#1074", "#1100", "#1099", "#1079", "#1096", + "#1101", "#1097", "#1095", "#1098", "#1070", "#1040", "#1041", + "#1062", "#1044", "#1045", "#1060", "#1043", "#1061", "#1048", + "#1049", "#1050", "#1051", "#1052", "#1053", "#1054", "#1055", + "#1071", "#1056", "#1057", "#1058", "#1059", "#1046", "#1042", + "#1068", "#1067", "#1047", "#1064", "#1069", "#1065", "#1063", + "#1066" +}; + +static entity_table_t ent_cp_1251[] = { + "#1040", "#1041", "#1042", "#1043", "#1044", "#1045", "#1046", + "#1047", "#1048", "#1049", "#1050", "#1051", "#1052", "#1053", + "#1054", "#1055", "#1056", "#1057", "#1058", "#1059", "#1060", + "#1061", "#1062", "#1063", "#1064", "#1065", "#1066", "#1067", + "#1068", "#1069", "#1070", "#1071", "#1072", "#1073", "#1074", + "#1075", "#1076", "#1077", "#1078", "#1079", "#1080", "#1081", + "#1082", "#1083", 
"#1084", "#1085", "#1086", "#1087", "#1088", + "#1089", "#1090", "#1091", "#1092", "#1093", "#1094", "#1095", + "#1096", "#1097", "#1098", "#1099", "#1100", "#1101", "#1102", + "#1103" +}; + +static entity_table_t ent_iso_8859_5[] = { + "#1056", "#1057", "#1058", "#1059", "#1060", "#1061", "#1062", + "#1063", "#1064", "#1065", "#1066", "#1067", "#1068", "#1069", + "#1070", "#1071", "#1072", "#1073", "#1074", "#1075", "#1076", + "#1077", "#1078", "#1079", "#1080", "#1081", "#1082", "#1083", + "#1084", "#1085", "#1086", "#1087", "#1088", "#1089", "#1090", + "#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097", + "#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#1104", + "#1105", "#1106", "#1107", "#1108", "#1109", "#1110", "#1111", + "#1112", "#1113", "#1114", "#1115", "#1116", "#1117", "#1118", + "#1119" +}; + +static entity_table_t ent_cp_866[] = { + + "#9492", "#9524", "#9516", "#9500", "#9472", "#9532", "#9566", + "#9567", "#9562", "#9556", "#9577", "#9574", "#9568", "#9552", + "#9580", "#9575", "#9576", "#9572", "#9573", "#9561", "#9560", + "#9554", "#9555", "#9579", "#9578", "#9496", "#9484", "#9608", + "#9604", "#9612", "#9616", "#9600", "#1088", "#1089", "#1090", + "#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097", + "#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#1025", + "#1105", "#1028", "#1108", "#1031", "#1111", "#1038", "#1118", + "#176", "#8729", "#183", "#8730", "#8470", "#164", "#9632", + "#160" +}; + + struct html_entity_map { enum entity_charset charset; /* charset identifier */ unsigned short basechar; /* char code at start of table */ @@ -281,6 +341,10 @@ { cs_big5hkscs, 0xa0, 0xff, ent_iso_8859_1 }, { cs_sjis, 0xa0, 0xff, ent_iso_8859_1 }, { cs_eucjp, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_koi8r, 0xa3, 0xff, ent_koi8r }, + { cs_cp1251, 0xc0, 0xff, ent_cp_1251 }, + { cs_8859_5, 0xc0, 0xff, ent_iso_8859_5 }, + { cs_cp866, 0xc0, 0xff, ent_cp_866 }, { cs_terminator } }; @@ -306,6 +370,17 @@ { "932", cs_sjis }, { 
"EUCJP", cs_eucjp }, { "EUC-JP", cs_eucjp }, + { "KOI8-R", cs_koi8r }, + { "koi8-ru", cs_koi8r }, + { "koi8r", cs_koi8r }, + { "cp1251", cs_cp1251 }, + { "Windows-1251", cs_cp1251 }, + { "win-1251", cs_cp1251 }, + { "iso8859-5", cs_8859_5 }, + { "iso-8859-5", cs_8859_5 }, + { "cp866", cs_cp866 }, + { "866", cs_cp866 }, + { "ibm866", cs_cp866 }, { NULL } }; @@ -643,7 +718,6 @@ if (charset_hint) { int found = 0; - /* now walk the charset map and look for the codeset */ for (i = 0; charset_map[i].codeset; i++) { if (strncasecmp(charset_hint, charset_map[i].codeset, len) == 0) { --Multipart_Fri__21_Mar_2003_12:41:05_+0200_08598ce0--