It was hinted in a comment to an answer to this question that PHP cannot reverse Unicode strings.
As for Unicode, it works in PHP because most app
Here's another way. This seems to work without having to specify an output encoding (tested with a couple of different mb_internal_encoding() settings):
/**
 * Reverse a UTF-8 string one code point at a time.
 *
 * @param string $text UTF-8 encoded input.
 * @return string The code points of $text in reverse order.
 */
function mb_strrev($text)
{
    // An empty pattern with the /u modifier matches between every code point,
    // so the split yields one array element per code point.
    $characters = preg_split('~~u', $text, -1, PREG_SPLIT_NO_EMPTY);
    $reversed = array_reverse($characters);

    return implode('', $reversed);
}
Here's another approach using regex:
/**
 * Reverse a UTF-8 string by matching every code point with a regex.
 *
 * @param string $str UTF-8 encoded input.
 * @return string The code points of $str in reverse order.
 */
function utf8_strrev($str){
    // /u makes "." match a whole code point; /s lets it match newlines too.
    preg_match_all('/./us', $str, $matches);
    $characters = $matches[0];

    return implode('', array_reverse($characters));
}
Another method:
/**
 * Reverse a multibyte string code point by code point using a byte-order trick.
 *
 * The string is converted to big-endian UTF-32, its bytes are reversed with
 * strrev(), and the result is read back as little-endian UTF-32. Reversing the
 * bytes of a big-endian code unit produces the same unit in little-endian
 * order, so only the order of the code points changes.
 *
 * UTF-32 is used rather than UTF-16 because it is fixed-width: with UTF-16 the
 * two halves of a surrogate pair would be swapped, corrupting every character
 * above U+FFFF (emoji, many CJK extensions, ...).
 *
 * @param string      $str Input string.
 * @param string|null $enc Encoding of $str; defaults to mb_internal_encoding().
 * @return string The reversed string, in the same encoding as the input.
 */
function mb_strrev($str, $enc = null) {
    if (is_null($enc)) {
        $enc = mb_internal_encoding();
    }

    $str = mb_convert_encoding($str, 'UTF-32BE', $enc);

    return mb_convert_encoding(strrev($str), $enc, 'UTF-32LE');
}
The answer
/**
 * Reverse a multibyte string by reading it one character at a time from the end.
 *
 * @param string      $text     Input string.
 * @param string|null $encoding Encoding of $text; when null, mbstring's
 *                              internal encoding is used.
 * @return string The characters of $text in reverse order.
 */
function mb_strrev($text, $encoding = null)
{
    // Resolve the default explicitly so the mb_* calls always receive a
    // concrete encoding name.
    $charset = ($encoding !== null) ? $encoding : mb_internal_encoding();

    $reversed = '';
    for ($index = mb_strlen($text, $charset) - 1; $index >= 0; $index--) {
        $reversed .= mb_substr($text, $index, 1, $charset);
    }

    return $reversed;
}
Grapheme functions handle UTF-8 strings more correctly than the mbstring and PCRE functions. Mbstring and PCRE may break characters. You can see the difference between them by executing the following code.
/**
 * Split a UTF-8 string into grapheme clusters using the intl extension.
 *
 * @param string $string UTF-8 encoded input.
 * @return array List with one grapheme cluster per element.
 */
function str_to_array($string)
{
    $count = grapheme_strlen($string);
    if (!($count > 0)) {
        // Nothing to extract (empty input, or the length could not be determined).
        return [];
    }

    return array_map(
        function ($offset) use ($string) {
            return grapheme_substr($string, $offset, 1);
        },
        range(0, $count - 1)
    );
}
/**
 * Split a UTF-8 string into code points using the mbstring extension.
 *
 * @param string $string UTF-8 encoded input.
 * @return array List with one code point per element.
 */
function str_to_array2($string)
{
    $characters = [];
    $total = mb_strlen($string, "UTF-8");

    $position = 0;
    while ($position < $total) {
        $characters[] = mb_substr($string, $position, 1, "UTF-8");
        $position++;
    }

    return $characters;
}
/**
 * Split a UTF-8 string into code points using PCRE.
 *
 * @param string $string UTF-8 encoded input.
 * @return array|false List with one code point per element, or whatever
 *                     preg_split() returns on failure.
 */
function str_to_array3($string)
{
    // The empty /u pattern matches at every code point boundary.
    $codePoints = preg_split('//u', $string, -1, PREG_SPLIT_NO_EMPTY);

    return $codePoints;
}
/**
 * Reverse a string using the pieces produced by str_to_array().
 *
 * @param string $string Input string.
 * @return string The pieces joined in reverse order.
 */
function utf8_strrev($string)
{
    $pieces = str_to_array($string);
    $reversed = array_reverse($pieces);

    return implode('', $reversed);
}
/**
 * Reverse a string using the pieces produced by str_to_array2().
 *
 * @param string $string Input string.
 * @return string The pieces joined in reverse order.
 */
function utf8_strrev2($string)
{
    $pieces = str_to_array2($string);
    $reversed = array_reverse($pieces);

    return implode('', $reversed);
}
/**
 * Reverse a string using the pieces produced by str_to_array3().
 *
 * @param string $string Input string.
 * @return string The pieces joined in reverse order.
 */
function utf8_strrev3($string)
{
    $pieces = str_to_array3($string);
    $reversed = array_reverse($pieces);

    return implode('', $reversed);
}
// Sample data adapted from http://www.php.net/manual/en/function.grapheme-strlen.php
// Both characters are written in decomposed form: a base letter followed by a
// combining mark, i.e. two code points that make up a single grapheme.
$aWithRing = "a\xCC\x8A";      // 'a' + U+030A COMBINING RING ABOVE (renders like U+00E5)
$oWithDiaeresis = "o\xCC\x88"; // 'o' + U+0308 COMBINING DIAERESIS (renders like U+00F6)
$string = $aWithRing . $oWithDiaeresis;

// Expected result next to the output of each implementation.
$results = [
    'should be' => "o\xCC\x88"."a\xCC\x8A",
    'grapheme' => utf8_strrev($string),
    'mbstring' => utf8_strrev2($string),
    'pcre' => utf8_strrev3($string)
];

// Dump the raw bytes as upper-case hex so broken sequences are visible.
$toHex = function ($elem) {
    return strtoupper(bin2hex($elem));
};
var_dump(array_map($toHex, $results));
The result is here.
array(4) {
["should be"]=>
string(12) "6FCC8861CC8A"
["grapheme"]=>
string(12) "6FCC8861CC8A"
["mbstring"]=>
string(12) "CC886FCC8A61"
["pcre"]=>
string(12) "CC886FCC8A61"
}
IntlBreakIterator can be used since PHP 5.5 (intl 3.0):
/**
 * Reverse a UTF-8 string using intl's code point break iterator.
 *
 * @param string $str UTF-8 encoded input.
 * @return string The code points of $str in reverse order.
 */
function utf8_strrev($str)
{
    $iterator = IntlBreakIterator::createCodePointInstance();
    $iterator->setText($str);

    // Each value yielded by the iterator is a byte offset of a boundary; the
    // bytes between two consecutive boundaries form one code point.
    $segments = [];
    $start = 0;
    foreach ($iterator as $boundary) {
        $segments[] = substr($str, $start, $boundary - $start);
        $start = $boundary;
    }

    return implode('', array_reverse($segments));
}
It is easy with utf8_strrev( $str ). See the relevant source code of my library, which I copied below:
/**
 * Reverse a UTF-8 string character by character.
 *
 * @param string $str Input string.
 * @return string The characters returned by utf8_split() joined in reverse order.
 */
function utf8_strrev( $str )
{
    $characters = utf8_split( $str );
    $reversed = array_reverse( $characters );

    return implode( '' , $reversed );
}
/**
 * Split a UTF-8 string into an array of characters (code points), optionally
 * grouped into chunks of $split_length characters.
 *
 * When PCRE has UTF-8 support, the string is cleaned with utf8_clean() and
 * split with a regular expression. Otherwise (or if PCRE rejects the subject)
 * a byte-level decoder is used that silently skips bytes which do not form a
 * complete 1- to 4-byte sequence.
 *
 * @param mixed $str          Value to split; it is cast to string.
 * @param int   $split_length Characters per returned element. Values of 1 or
 *                            less yield one character per element.
 * @return array List of string pieces; always an array, empty when there is
 *               nothing to return.
 */
function utf8_split( $str , $split_length = 1 )
{
    $str = ( string ) $str;
    $ret = false;

    if( pcre_utf8_support( ) )
    {
        $clean = ( string ) utf8_clean( $str );
        // Zero-width split at every position that is neither the start nor
        // the end of the subject. preg_split() returns false when PCRE
        // refuses the subject; in that case the byte-level fallback below
        // runs instead of handing false back to the caller.
        $ret = preg_split( '/(?<!^)(?!$)/u' , $clean );
        // \X is buggy in many recent versions of PHP
        //preg_match_all( '/\X/u' , $str , $ret );
        //$ret = $ret[0];
    }

    if( $ret === false )
    {
        // Fallback: decode by inspecting the lead byte of each sequence.
        $ret = array( );
        $len = strlen( $str );
        for( $i = 0 ; $i < $len ; $i++ )
        {
            if( ( $str[$i] & "\x80" ) === "\x00" )
            {
                // 0xxxxxxx: single-byte (ASCII) character.
                $ret[] = $str[$i];
            }
            else if( ( ( $str[$i] & "\xE0" ) === "\xC0" ) && ( isset( $str[$i+1] ) ) )
            {
                // 110xxxxx: lead byte of a 2-byte sequence.
                if( ( $str[$i+1] & "\xC0" ) === "\x80" )
                {
                    $ret[] = $str[$i] . $str[$i+1];
                    $i++;
                }
            }
            else if( ( ( $str[$i] & "\xF0" ) === "\xE0" ) && ( isset( $str[$i+2] ) ) )
            {
                // 1110xxxx: lead byte of a 3-byte sequence.
                if( ( ( $str[$i+1] & "\xC0" ) === "\x80" ) && ( ( $str[$i+2] & "\xC0" ) === "\x80" ) )
                {
                    $ret[] = $str[$i] . $str[$i+1] . $str[$i+2];
                    $i = $i + 2;
                }
            }
            else if( ( ( $str[$i] & "\xF8" ) === "\xF0" ) && ( isset( $str[$i+3] ) ) )
            {
                // 11110xxx: lead byte of a 4-byte sequence.
                if( ( ( $str[$i+1] & "\xC0" ) === "\x80" ) && ( ( $str[$i+2] & "\xC0" ) === "\x80" ) && ( ( $str[$i+3] & "\xC0" ) === "\x80" ) )
                {
                    $ret[] = $str[$i] . $str[$i+1] . $str[$i+2] . $str[$i+3];
                    $i = $i + 3;
                }
            }
        }
    }

    // preg_split() yields array('') for an empty subject and the fallback
    // yields array() for one; normalise both to an empty list. isset() avoids
    // reading offset 0 of an empty array.
    if( !isset( $ret[0] ) || $ret[0] === '' )
    {
        return array( );
    }

    if( $split_length > 1 )
    {
        $ret = array_chunk( $ret , $split_length );
        $ret = array_map( 'implode' , $ret );
    }

    return $ret;
}
/**
 * Remove every byte that is not part of a well-formed UTF-8 sequence.
 *
 * The pattern keeps only sequences allowed by the UTF-8 definition: overlong
 * encodings (lead bytes C0/C1, E0 80-9F, F0 80-8F), UTF-16 surrogates
 * (ED A0-BF) and code points above U+10FFFF (F4 90+ and lead bytes F5-F7) are
 * dropped, because PCRE's /u mode rejects subjects that contain them.
 *
 * @param string $str        Input bytes.
 * @param bool   $remove_bom When true, every occurrence of the UTF-8 byte
 *                           order mark is removed as well.
 * @return string|null The cleaned string (whatever preg_replace() returns).
 */
function utf8_clean( $str , $remove_bom = false )
{
    // Group 1 captures one well-formed sequence; the trailing "." consumes any
    // other single byte, for which group 1 is empty and '$1' inserts nothing.
    $regx = '/('
          . '[\x00-\x7F]'                         // ASCII
          . '|[\xC2-\xDF][\x80-\xBF]'             // 2 bytes, no overlong C0/C1
          . '|\xE0[\xA0-\xBF][\x80-\xBF]'         // 3 bytes, no overlongs
          . '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'  // 3 bytes, straight ranges
          . '|\xED[\x80-\x9F][\x80-\xBF]'         // 3 bytes, no surrogates
          . '|\xF0[\x90-\xBF][\x80-\xBF]{2}'      // 4 bytes, no overlongs
          . '|[\xF1-\xF3][\x80-\xBF]{3}'          // 4 bytes, straight ranges
          . '|\xF4[\x80-\x8F][\x80-\xBF]{2}'      // 4 bytes, up to U+10FFFF
          . ')|./s';
    $str = preg_replace( $regx , '$1' , $str );
    if( $remove_bom )
    {
        $str = utf8_str_replace( utf8_bom( ) , '' , $str );
    }
    return $str;
}
/**
 * Thin wrapper that forwards all arguments to str_replace().
 *
 * @param mixed $search  Value(s) to search for.
 * @param mixed $replace Replacement value(s).
 * @param mixed $subject String or array to search in.
 * @param int   $count   Receives the number of replacements performed.
 * @return mixed The result of str_replace().
 */
function utf8_str_replace( $search , $replace , $subject , &$count = 0 )
{
    $result = str_replace( $search , $replace , $subject , $count );

    return $result;
}
/**
 * Return the three-byte UTF-8 byte order mark.
 *
 * @return string The bytes EF BB BF.
 */
function utf8_bom( )
{
    $bom = "\xef\xbb\xbf";

    return $bom;
}
/**
 * Report whether PCRE accepts the /u (UTF-8) modifier.
 *
 * The probe runs once; its result is kept in a static variable and reused on
 * later calls.
 *
 * @return int|false The preg_match() result of the probe: 1 when the pattern
 *                   matched, false when it failed.
 */
function pcre_utf8_support( )
{
    static $cached = null;

    if( $cached === null )
    {
        // Errors from the probe are suppressed; only its return value matters.
        $cached = @preg_match( '//u', '' );
    }

    return $cached;
}