<?php
/*.
require_module 'standard';
require_module 'spl';
require_module 'pcre';
.*/
namespace it\icosaedro\utils;
use OutOfRangeException;
/**
* Utility functions for UTF-8 BMP string encoding. This class only provides
* very basic functions mostly intended to be used in other, higher-level
* packages.
*
* WARNING. These functions do not check for the actual encoding of the
* passed strings and always assume blindly these strings are properly
* UTF-8 encoded strings. If arbitrary data are passed, unexpected results
* may arise.
*
* ATTENTION. In this document the term <i>byte</i> always refers to a
* single byte of a generic string; the term <i>character</i> refers to
* a single Unicode character, that may be encoded as a sequence of 1,
* 2 or 3 bytes; the term <i>codepoint</i> refers to the numerical code of
* a single Unicode character in the range [0,65535].
*
* @author Umberto Salsi <salsi@icosaedro.it>
* @version $Date: 2012/04/02 09:13:24 $
*/
class UTF8 {

    /**
     * Sanitizes the string removing invalid bytes. Invalid bytes, incomplete
     * UTF-8 sequences, non-minimal sequences, sequences longer than 3 bytes
     * and surrogate codepoints (U+D800..U+DFFF) are removed.
     * @param string $s The string to sanitize, possibly NULL.
     * @return string Properly encoded UTF-8 BMP string. If the subject string
     * is NULL, NULL is returned as well.
     */
    public static function sanitize($s)
    {
        static $PATTERN = /*. (string) .*/ NULL;
        if ($s === NULL) {
            return NULL;
        }
        if ($PATTERN === NULL) {
            // The byte ranges are written as regex-level \xHH escapes
            // (single-quoted PHP strings) so that no raw NUL or non-ASCII
            // byte ends up inside the pattern itself.
            $C = '[\x80-\xBF]';
            $PATTERN = '/('
                . '[\x00-\x7F]'                         // 1 byte (ASCII)
                . '|[\xC2-\xDF]' . $C                   // 2 bytes, minimal
                . '|\xE0[\xA0-\xBF]' . $C               // 3 bytes, minimal
                . '|[\xE1-\xEC\xEE\xEF]' . $C . $C      // 3 bytes, general
                . '|\xED[\x80-\x9F]' . $C               // 3 bytes, no surrogates
                // Anything else is matched one byte at a time by the second
                // group and dropped; "s" lets "." match every byte value.
                . ')|(.)/s';
        }
        // Valid sequences are captured in group 1 and copied to the result;
        // for invalid bytes group 1 is unset, so they are replaced by "".
        return preg_replace($PATTERN, '$1', $s);
    }

    /**
     * Returns the codepoint as UTF-8 string of bytes.
     * @param int $code Codepoint [0,65535].
     * @return string String of bytes that represents the given codepoint.
     * @throws OutOfRangeException If the codepoint is invalid.
     */
    public static function chr($code)
    {
        if ($code < 0 || $code > 65535) {
            throw new OutOfRangeException("$code");
        }
        if ($code < 0x80) {
            // 0xxxxxxx
            return chr($code);
        }
        if ($code < 0x800) {
            // 110xxxxx 10xxxxxx
            return chr(0xc0 | ($code >> 6))
                . chr(0x80 | ($code & 0x3f));
        }
        // 1110xxxx 10xxxxxx 10xxxxxx
        return chr(0xe0 | ($code >> 12))
            . chr(0x80 | (($code >> 6) & 0x3f))
            . chr(0x80 | ($code & 0x3f));
    }

    /**
     * Return true if the passed byte is the continuation byte of a
     * UTF-8 sequence, that is its two highest bits are "10".
     * @param int $b Subject byte.
     * @return bool True if the subject byte is the continuation byte
     * of a UTF-8 sequence.
     */
    public static function isCont($b)
    {
        return ($b & 0xc0) == 0x80;
    }

    /**
     * Return the length of the UTF-8 sequence given its starting byte.
     * Byte code ranges are as follows (by increasing code):
     * <pre>
     * [0x00,0x7f] 1 byte sequence (ASCII) returns 1
     * [0x80,0xbf] continuation byte -- returns 0
     * [0xc0,0xc1] unused byte codes -- returns 0
     * [0xc2,0xdf] 2 bytes seq. starts -- returns 2
     * [0xe0,0xef] 3 bytes seq. starts -- returns 3
     * [0xf0,0xff] unused byte codes -- returns 0
     * </pre>
     * @param int $b First byte of the sequence in [0,255].
     * @return int Length of the sequence in bytes, that is 1, 2 or 3.
     * Returns 0 if the byte code is invalid or out of the range [0,255].
     */
    public static function sequenceLength($b)
    {
        if ($b < 0) {
            return 0;
        }
        if ($b <= 0x7f) {
            return 1;
        }
        if ($b <= 0xc1) {
            return 0;
        }
        if ($b <= 0xdf) {
            return 2;
        }
        if ($b <= 0xef) {
            return 3;
        }
        return 0;
    }

    /**
     * Returns the codepoint at a given position in a string. The bytes are
     * decoded without validation: a leading byte of the form 110xxxxx is
     * decoded as a 2 bytes sequence, any other byte with the high bit set
     * is decoded as the start of a 3 bytes sequence.
     * @param string $s UTF-8 encoded string.
     * @param int $byte_index Byte index of the sequence.
     * @return int The code of the codepoint.
     * @throws OutOfRangeException If the index is invalid.
     */
    public static function codepointAtByteIndex($s, $byte_index)
    {
        if ($byte_index < 0 || $byte_index >= strlen($s)) {
            throw new OutOfRangeException("$byte_index");
        }
        $b1 = ord($s[$byte_index]);
        if (($b1 & 0x80) == 0) {
            return $b1;
        }
        $b2 = ord($s[$byte_index + 1]);
        if (($b1 & 0xe0) == 0xc0) {
            return (($b1 & 0x1f) << 6) + ($b2 & 0x3f);
        }
        $b3 = ord($s[$byte_index + 2]);
        return (($b1 & 0x0f) << 12) + (($b2 & 0x3f) << 6) + ($b3 & 0x3f);
    }

    /**
     * Return the byte index given the UTF-8 sequence index.
     * @param string $s UTF-8 encoded string.
     * @param int $codepoint_index Index of the UTF-8 sequence, ranging from
     * 0 (the first sequence) up to the length in characters of the
     * string. Note that this last sequence does not exist because
     * its byte index is just one byte above the last sequence so the
     * index returned points to the byte just next to the end of the
     * string.
     * @return int Byte index of the UTF-8 sequence.
     * @throws OutOfRangeException If the parameter is out of the
     * range from 0 up to the length in characters of the string.
     */
    public static function byteIndex($s, $codepoint_index)
    {
        if ($codepoint_index < 0) {
            throw new OutOfRangeException("$codepoint_index");
        }
        $s_len = strlen($s);
        $byte_index = 0;
        // A separate counter keeps $codepoint_index intact, so the exception
        // reports the index the caller actually passed.
        for ($remaining = $codepoint_index; $remaining > 0; $remaining--) {
            if ($byte_index >= $s_len) {
                throw new OutOfRangeException("$codepoint_index");
            }
            $seq_len = self::sequenceLength(ord($s[$byte_index]));
            // An invalid starting byte is skipped as if it were 1 byte long.
            $byte_index += ($seq_len > 0) ? $seq_len : 1;
        }
        return $byte_index;
    }

    /**
     * Returns the codepoint index given its byte index.
     * @param string $s UTF-8 encoded string.
     * @param int $byte_index Byte index of the codepoint, in
     * [0,strlen($s)]. Note that if $byte_index is exactly equal
     * to strlen($s), then the result is the length of the string
     * in codepoints.
     * @return int Codepoint index, that is the number of UTF-8
     * sequences from the beginning of the string up there.
     * @throws OutOfRangeException If $byte_index is out of the range
     * [0,strlen($s)].
     */
    public static function codepointIndex($s, $byte_index)
    {
        if ($byte_index < 0 || $byte_index > strlen($s)) {
            throw new OutOfRangeException("$byte_index");
        }
        // Counts how many non-continuation bytes precede $byte_index, that
        // is in the range [0,$byte_index-1]. The cost of this scan is
        // linear in $byte_index.
        $codepoint_index = 0;
        for ($i = 0; $i < $byte_index; $i++) {
            if (!self::isCont(ord($s[$i]))) {
                $codepoint_index++;
            }
        }
        return $codepoint_index;
    }

    /**
     * Returns the code of the codepoint at the given index.
     * @param string $s UTF-8 encoded string.
     * @param int $codepoint_index Index of the codepoint, in the range from 0
     * up to the length of the string minus one. Note that for an
     * empty string there is no valid range.
     * @return int Code of the codepoint.
     * @throws OutOfRangeException If the index is invalid.
     */
    public static function codepointAt($s, $codepoint_index)
    {
        return self::codepointAtByteIndex($s, self::byteIndex($s, $codepoint_index));
    }

    /**
     * Return the length of the string as number of characters.
     * @param string $s UTF-8 encoded string.
     * @return int Length of the string as number of characters.
     */
    public static function length($s)
    {
        // Every character contributes exactly one non-continuation byte.
        $len = 0;
        $n = strlen($s);
        for ($i = 0; $i < $n; $i++) {
            if (!self::isCont(ord($s[$i]))) {
                $len++;
            }
        }
        return $len;
    }

    /**
     * Returns the character at the given index.
     * @param string $s UTF-8 encoded string.
     * @param int $i Index of the character in the range from 0 up to
     * UTF8::length($s)-1.
     * @return string The character as a UTF-8 string. The returned string
     * may contain from 1 up to 3 bytes. If the byte found at that position
     * is not a valid starting byte, or the sequence it starts runs past the
     * end of the string, "?" is returned instead.
     * @throws OutOfRangeException If the index is invalid.
     */
    public static function charAt($s, $i)
    {
        // Throws OutOfRangeException("$i") for a negative index or an index
        // past the end of the string.
        $byte_index = self::byteIndex($s, $i);
        $s_len = strlen($s);
        // byteIndex() accepts the position just past the last character,
        // where there is nothing to return.
        if ($byte_index >= $s_len) {
            throw new OutOfRangeException("$i");
        }
        $seq_len = self::sequenceLength(ord($s[$byte_index]));
        if ($seq_len <= 0 || $byte_index + $seq_len > $s_len) {
            return "?";
        }
        return substr($s, $byte_index, $seq_len);
    }
}
Total: 278 lines, 8127 bytes.