(Updated a little)
I'm not very experienced with internationalization using PHP, it must be said, and a good deal of searching didn't really provide th
/**
 * Reports whether every character of a UTF-8 string appears in the
 * character table listed below.
 *
 * @param string $utf8_string Text to examine; it is read as UTF-8.
 * @return bool true when all characters are in the table (an empty string
 *              also yields true), false as soon as one character is not.
 */
function is_gsm0338( $utf8_string ) {
    // Accepted characters, one table row per line.
    $allowed_chars = array(
        '@','Δ',' ','0','¡','P','¿','p',
        '£','_','!','1','A','Q','a','q',
        '$','Φ','"','2','B','R','b','r',
        '¥','Γ','#','3','C','S','c','s',
        'è','Λ','¤','4','D','T','d','t',
        'é','Ω','%','5','E','U','e','u',
        'ù','Π','&','6','F','V','f','v',
        'ì','Ψ','\'','7','G','W','g','w',
        'ò','Σ','(','8','H','X','h','x',
        'Ç','Θ',')','9','I','Y','i','y',
        "\n",'Ξ','*',':','J','Z','j','z',
        'Ø',"\x1B",'+',';','K','Ä','k','ä',
        'ø','Æ',',','<','L','Ö','l','ö',
        "\r",'æ','-','=','M','Ñ','m','ñ',
        'Å','ß','.','>','N','Ü','n','ü',
        'å','É','/','?','O','§','o','à'
    );

    // Walk the string one UTF-8 character (not one byte) at a time.
    $total = mb_strlen( $utf8_string, 'UTF-8' );
    $pos = 0;
    while ( $pos < $total ) {
        $current = mb_substr( $utf8_string, $pos, 1, 'UTF-8' );
        if ( in_array( $current, $allowed_chars ) === false ) {
            return false;
        }
        $pos++;
    }

    return true;
}
Although this is an old thread, I recently had to solve a very similar problem and wanted to post my answer. The PHP code is fairly simple. It starts with a painstakingly large array of valid GSM character codes, then simply checks whether the current character is in that array using the ord($string) function, which returns the ASCII (byte) value of the first character of the string passed. Here is the code I use to validate whether a string is GSM-worthy.
// Unicode code points accepted as GSM characters, one table row per line.
$valid_gsm_keycodes = array(
    0x0040, 0x0394, 0x0020, 0x0030, 0x00a1, 0x0050, 0x00bf, 0x0070,
    0x00a3, 0x005f, 0x0021, 0x0031, 0x0041, 0x0051, 0x0061, 0x0071,
    0x0024, 0x03a6, 0x0022, 0x0032, 0x0042, 0x0052, 0x0062, 0x0072,
    0x00a5, 0x0393, 0x0023, 0x0033, 0x0043, 0x0053, 0x0063, 0x0073,
    0x00e8, 0x039b, 0x00a4, 0x0034, 0x0044, 0x0054, 0x0064, 0x0074,
    0x00e9, 0x03a9, 0x0025, 0x0035, 0x0045, 0x0055, 0x0065, 0x0075,
    0x00f9, 0x03a0, 0x0026, 0x0036, 0x0046, 0x0056, 0x0066, 0x0076,
    0x00ec, 0x03a8, 0x0027, 0x0037, 0x0047, 0x0057, 0x0067, 0x0077,
    0x00f2, 0x03a3, 0x0028, 0x0038, 0x0048, 0x0058, 0x0068, 0x0078,
    0x00c7, 0x0398, 0x0029, 0x0039, 0x0049, 0x0059, 0x0069, 0x0079,
    0x000a, 0x039e, 0x002a, 0x003a, 0x004a, 0x005a, 0x006a, 0x007a,
    0x00d8, 0x001b, 0x002b, 0x003b, 0x004b, 0x00c4, 0x006b, 0x00e4,
    0x00f8, 0x00c6, 0x002c, 0x003c, 0x004c, 0x00d6, 0x006c, 0x00f6,
    0x000d, 0x00e6, 0x002d, 0x003d, 0x004d, 0x00d1, 0x006d, 0x00f1,
    0x00c5, 0x00df, 0x002e, 0x003e, 0x004e, 0x00dc, 0x006e, 0x00fc,
    0x00e5, 0x00c9, 0x002f, 0x003f, 0x004f, 0x00a7, 0x006f, 0x00e0 );

// $string is read as UTF-8. It is walked character by character (not byte by
// byte) so that multi-byte characters such as the Greek capitals are seen as
// one code point each.
$char_count = mb_strlen($string, 'UTF-8');
for ($i = 0; $i < $char_count; $i++) {
    $char = mb_substr($string, $i, 1, 'UTF-8');

    // Decode the character to its integer code point: UCS-4BE is the code
    // point as a 4-byte big-endian integer, which unpack('N') reads back.
    $decoded = unpack('N', mb_convert_encoding($char, 'UCS-4BE', 'UTF-8'));
    $codepoint = $decoded[1];

    // Strict comparison: both sides are integers, so no type juggling applies.
    if (!in_array($codepoint, $valid_gsm_keycodes, true)) return false;
}
return true;
I know this isn't PHP code, but I think it might help anyway. This is how I do it in an app I wrote to detect whether it's possible to send a message as GSM 03.38 (you could do something similar for plain text). It has two translation tables, one for normal GSM and one for the extended, and a function that loops through all characters checking whether each one can be converted.
#define UCS2_TO_GSM_LOOKUP_TABLE_SIZE 0x100   // Ucs2ToGsm covers code points 0x00..0xFF
#define NON_GSM 0x80                          // table marker: no GSM equivalent (above any 7-bit value)
#define UCS2_GCL_RANGE 24                     // last valid index of the Greek capital letter table
#define UCS2_GREEK_CAPITAL_LETTER_ALPHA 0x0391
#define EXTEND 0x001B                         // table marker used for [ \ ] ^ { | } ~

// Lookup table indexed by a code point in 0x00..0xFF; each entry is the GSM
// value for that character, EXTEND, or NON_GSM when there is no equivalent.
//
// note that the ` character is mapped to ' so that all characters that can be typed on
// a standard north american keyboard can be converted to the GSM default character set
static unsigned char Ucs2ToGsm[UCS2_TO_GSM_LOOKUP_TABLE_SIZE] =
{           /*+0x0      +0x1        +0x2        +0x3        +0x4        +0x5        +0x6        +0x7*/
/*0x00*/    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,
/*0x08*/    NON_GSM,    NON_GSM,    0x0a,       NON_GSM,    NON_GSM,    0x0d,       NON_GSM,    NON_GSM,
/*0x10*/    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,
/*0x18*/    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,
/*0x20*/    0x20,       0x21,       0x22,       0x23,       0x02,       0x25,       0x26,       0x27,
/*0x28*/    0x28,       0x29,       0x2a,       0x2b,       0x2c,       0x2d,       0x2e,       0x2f,
/*0x30*/    0x30,       0x31,       0x32,       0x33,       0x34,       0x35,       0x36,       0x37,
/*0x38*/    0x38,       0x39,       0x3a,       0x3b,       0x3c,       0x3d,       0x3e,       0x3f,
/*0x40*/    0x00,       0x41,       0x42,       0x43,       0x44,       0x45,       0x46,       0x47,
/*0x48*/    0x48,       0x49,       0x4a,       0x4b,       0x4c,       0x4d,       0x4e,       0x4f,
/*0x50*/    0x50,       0x51,       0x52,       0x53,       0x54,       0x55,       0x56,       0x57,
/*0x58*/    0x58,       0x59,       0x5a,       EXTEND,     EXTEND,     EXTEND,     EXTEND,     0x11,
/*0x60*/    0x27,       0x61,       0x62,       0x63,       0x64,       0x65,       0x66,       0x67,
/*0x68*/    0x68,       0x69,       0x6a,       0x6b,       0x6c,       0x6d,       0x6e,       0x6f,
/*0x70*/    0x70,       0x71,       0x72,       0x73,       0x74,       0x75,       0x76,       0x77,
/*0x78*/    0x78,       0x79,       0x7a,       EXTEND,     EXTEND,     EXTEND,     EXTEND,     NON_GSM,
/*0x80*/    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,
/*0x88*/    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,
/*0x90*/    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,
/*0x98*/    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,
/*0xa0*/    NON_GSM,    0x40,       NON_GSM,    0x01,       0x24,       0x03,       NON_GSM,    0x5f,
/*0xa8*/    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,
/*0xb0*/    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,
/*0xb8*/    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,    0x60,
/*0xc0*/    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,    0x5b,       0x0e,       0x1c,       0x09,
// 0xcf (I with diaeresis) has no GSM equivalent; only 0xbf maps to 0x60.
/*0xc8*/    NON_GSM,    0x1f,       NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,
/*0xd0*/    NON_GSM,    0x5d,       NON_GSM,    NON_GSM,    NON_GSM,    NON_GSM,    0x5c,       NON_GSM,
/*0xd8*/    0x0b,       NON_GSM,    NON_GSM,    NON_GSM,    0x5e,       NON_GSM,    NON_GSM,    0x1e,
/*0xe0*/    0x7f,       NON_GSM,    NON_GSM,    NON_GSM,    0x7b,       0x0f,       0x1d,       NON_GSM,
/*0xe8*/    0x04,       0x05,       NON_GSM,    NON_GSM,    0x07,       NON_GSM,    NON_GSM,    NON_GSM,
/*0xf0*/    NON_GSM,    0x7d,       0x08,       NON_GSM,    NON_GSM,    NON_GSM,    0x7c,       NON_GSM,
/*0xf8*/    0x0c,       0x06,       NON_GSM,    NON_GSM,    0x7e,       NON_GSM,    NON_GSM,    NON_GSM
};
// GSM values for the code points 0x0391..0x03a9. The index is the code point
// minus UCS2_GREEK_CAPITAL_LETTER_ALPHA. Entries annotated with a Latin letter
// reuse that letter's value; NON_GSM marks a code point with no mapping.
static unsigned char Ucs2GclToGsm[UCS2_GCL_RANGE + 1] =
{
    0x41,       // 0x0391  Alpha    (as A)
    0x42,       // 0x0392  Beta     (as B)
    0x13,       // 0x0393  Gamma
    0x10,       // 0x0394  Delta
    0x45,       // 0x0395  Epsilon  (as E)
    0x5a,       // 0x0396  Zeta     (as Z)
    0x48,       // 0x0397  Eta      (as H)
    0x19,       // 0x0398  Theta
    0x49,       // 0x0399  Iota     (as I)
    0x4b,       // 0x039a  Kappa    (as K)
    0x14,       // 0x039b  Lambda
    0x4d,       // 0x039c  Mu       (as M)
    0x4e,       // 0x039d  Nu       (as N)
    0x1a,       // 0x039e  Xi
    0x4f,       // 0x039f  Omicron  (as O)
    0x16,       // 0x03a0  Pi
    0x50,       // 0x03a1  Rho      (as P)
    NON_GSM,    // 0x03a2  no mapping
    0x18,       // 0x03a3  Sigma
    0x54,       // 0x03a4  Tau      (as T)
    0x59,       // 0x03a5  Upsilon  (as Y)
    0x12,       // 0x03a6  Phi
    0x58,       // 0x03a7  Chi      (as X)
    0x17,       // 0x03a8  Psi
    0x15        // 0x03a9  Omega
};
// Returns true when szUnicodeChar has no entry in the GSM lookup tables.
//
// Code points below UCS2_TO_GSM_LOOKUP_TABLE_SIZE are looked up in Ucs2ToGsm,
// the Greek capital letter range is looked up in Ucs2GclToGsm, and the euro
// sign (0x20AC) is accepted explicitly. Everything else is reported as not GSM.
// Entries marked EXTEND count as GSM, since only NON_GSM is tested for.
bool Gsm0338Encoding::IsNotGSM( wchar_t szUnicodeChar )
{
    // wchar_t is a signed type on some platforms. Converting to an unsigned
    // value first turns a negative input into a very large one, so it fails
    // every range check below instead of being used as a negative table index.
    const unsigned long codePoint = static_cast<unsigned long>( szUnicodeChar );

    if( codePoint < UCS2_TO_GSM_LOOKUP_TABLE_SIZE )
    {
        return Ucs2ToGsm[codePoint] == NON_GSM;
    }

    if( (codePoint >= UCS2_GREEK_CAPITAL_LETTER_ALPHA) &&
        (codePoint <= (UCS2_GREEK_CAPITAL_LETTER_ALPHA + UCS2_GCL_RANGE)) )
    {
        return Ucs2GclToGsm[codePoint - UCS2_GREEK_CAPITAL_LETTER_ALPHA] == NON_GSM;
    }

    // The euro sign is the only accepted character outside the two tables.
    return codePoint != 0x20AC; // €
}
// Returns true when no character of str is rejected by IsNotGSM
// (an empty string therefore yields true).
bool Gsm0338Encoding::IsGSM( const std::wstring& str )
{
    // find_if stops at the first rejected character; reaching end() means none was found.
    return std::find_if( str.begin(), str.end(), IsNotGSM ) == str.end();
}
To deal with it conceptually before getting into mechanisms, and apologies if any of this is obvious, a string can be defined as a sequence of Unicode characters, Unicode being a database that gives an id number known as a code point to every character you might need to work with. GSM-338 contains a subset of the Unicode characters, so what you're doing is extracting a set of codepoints from your string, and checking to see if that set is contained in GSM-338.
// second column of http://unicode.org/Public/MAPPINGS/ETSI/GSM0338.TXT
// NOTE: the "..." below is a placeholder for the elided entries; substitute
// the complete list of code points before running this.
$gsm338_codepoints = array(0x0040, 0x0000, ..., 0x00fc, 0x00e0);

// Stays true only if every code point of $mystring is in the list above.
$can_use_gsm338 = true;
foreach(codepoints($mystring) as $codepoint){
    if(!in_array($codepoint, $gsm338_codepoints)){
        // One code point outside the list is enough to decide; stop scanning.
        $can_use_gsm338 = false;
        break;
    }
}
That leaves the definition of the function codepoints($string), which isn't built in to PHP. PHP understands a string to be a sequence of bytes rather than a sequence of Unicode characters. The best way of bridging the gap is to get your strings into UTF8 as quickly as you can and keep them in UTF8 as long as you can - you'll have to use other encodings when dealing with external systems, but isolate the conversion to the interface to that system and deal only with utf8 internally.
The functions you need to convert between php strings in utf8 and sequences of codepoints can be found at http://hsivonen.iki.fi/php-utf8/ , so that's your codepoints() function.
If you're taking data from an external source that gives you Unicode slash-escaped characters ("Let's test \u00f6\u00e4\u00fc..."), that string escape format should be converted to utf8. I don't know offhand of a function to do this, if one can't be found, it's a matter of string/regex processing + the use of the hsivonen.iki.fi functions, for example when you hit \u00f6, replace it with the utf8 representation of the codepoint 0xf6.
PHP6 will have better unicode support but there are a few functions you can use.
My first thought was mb_convert_encoding but as you said this will shorten messages to 70 chars - so perhaps you can use this in conjunction with mb_detect_encoding?
See: Multibyte Functions
// Matches when $text consists only of characters from the bracketed set (an
// empty $text also matches, because of the `*`). The set is built from three
// parts: hex-escaped code points up to \xFC, then the characters produced by
// json_decode() from the \uXXXX escapes (Greek capitals and the euro sign).
// The /u modifier makes PCRE treat both pattern and subject as UTF-8, so the
// \xNN escapes denote code points rather than raw bytes.
preg_match('/^[\x0A\x0C\x0D\x20-\x5F\x61-\x7E\xA0\xA1\xA3-\xA5\xA7'.
'\xBF\xC4-\xC6\xC9\xD1\xD6\xD8\xDC\xDF\xE0\xE4-\xE9\xEC\xF1'.
'\xF2\xF6\xF8\xF9\xFC'.
json_decode('"\u0393\u0394\u0398\u039B\u039E\u03A0\u03A3\u03A6\u03A8\u03A9\u20AC"').
']*$/u', $text)
or
// Single-string form of the check: matches when $text consists only of the
// characters in the bracketed set (an empty $text also matches). The Greek
// capitals and the euro sign are written literally, so this source file must
// be saved as UTF-8 for the /u pattern to be valid.
preg_match('/^[\x0A\x0C\x0D\x20-\x5F\x61-\x7E\xA0\xA1\xA3-\xA5\xA7\xBF\xC4-\xC6\xC9\xD1\xD6\xD8\xDC\xDF\xE0\xE4-\xE9\xEC\xF1\xF2\xF6\xF8\xF9\xFCΓΔΘΛΞΠΣΦΨΩ€]*$/u', $text)