Page Menu
Home
DevCentral
Search
Configure Global Search
Log In
Files
F11708808
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
15 KB
Referenced Files
None
Subscribers
None
View Options
diff --git a/omnitools/src/Strings/Multibyte/OmniString.php b/omnitools/src/Strings/Multibyte/OmniString.php
index 5e94aa3..7b8b853 100644
--- a/omnitools/src/Strings/Multibyte/OmniString.php
+++ b/omnitools/src/Strings/Multibyte/OmniString.php
@@ -1,125 +1,183 @@
<?php
declare(strict_types=1);
namespace Keruald\OmniTools\Strings\Multibyte;
use Keruald\OmniTools\Collections\Vector;
+/**
+ * Represents a multibyte string and performs operations with the grapheme
+ * library for UTF-8 encoding, and with mbstring for other encodings.
+ */
class OmniString {
use WithEncoding;
///
/// Private members
///
- /**
- * @var string
- */
- private $value;
+ private string $value;
///
/// Constructor
///
public function __construct (string $value = '', string $encoding = '') {
$this->value = $value;
$this->setEncoding($encoding ?: "UTF-8");
}
///
/// Magic methods
///
public function __toString() : string {
return $this->value;
}
///
/// Helper methods
///
public function pad(
int $padLength = 0,
string $padString = ' ',
int $padType = STR_PAD_RIGHT
) : string {
return (new StringPad)
->setInput($this->value)
->setEncoding($this->encoding)
->setPadLength($padLength)
->setPadString($padString)
->setPadType($padType)
->pad();
}
public function startsWith (string $start) : bool {
return str_starts_with($this->value, $start);
}
public function endsWith (string $end) : bool {
return str_ends_with($this->value, $end);
}
+ /**
+ * Gets the length of the string, as returned by countGraphemes().
+ *
+ * @deprecated Use a more specific method to express your intent:
+ * countBytes, countCodePoints or countGraphemes
+ */
public function len () : int {
+ return $this->countGraphemes();
+ }
+
+ /**
+ * Counts the bytes of the string, as reported by strlen().
+ */
+ public function countBytes () : int {
+ return strlen($this->value);
+ }
+
+ /**
+ * Counts the code points of the string in its current encoding,
+ * as reported by mb_strlen().
+ */
+ public function countCodePoints () : int {
return mb_strlen($this->value, $this->encoding);
}
- public function getChars () : array {
+ /**
+ * Counts the graphemes of the string.
+ *
+ * For UTF-8, the count comes from grapheme_strlen(). For any other
+ * encoding, it falls back to the code points count.
+ */
+ public function countGraphemes () : int {
+ return match ($this->encoding) {
+ "UTF-8" => grapheme_strlen($this->value),
+ default => $this->countCodePoints(),
+ };
+ }
+
+ /**
+ * Splits the string into chunks of one byte each, using str_split().
+ *
+ * @return string[]
+ */
+ public function getBytes() : array {
+ return str_split($this->value, 1);
+ }
+
+ /**
+ * Splits the string into chunks of one code point each, using
+ * mb_str_split() with the current encoding.
+ *
+ * @return string[]
+ */
+ public function getCodePoints () : array {
+ return mb_str_split($this->value, 1, $this->encoding);
+ }
+
+ /**
+ * Splits the string into graphemes.
+ *
+ * For UTF-8, each grapheme is extracted with grapheme_substr(). For any
+ * other encoding, it falls back to getCodePoints().
+ *
+ * @return string[]
+ */
+ public function getGraphemes () : array {
+ if ($this->encoding !== "UTF-8") {
+ return $this->getCodePoints();
+ }
+
$chars = [];
- $len = $this->len();
+ $len = grapheme_strlen($this->value);
for ($i = 0 ; $i < $len ; $i++) {
- $chars[] = mb_substr($this->value, $i, 1, $this->encoding);
+ // One grapheme_substr() call per grapheme index.
+ $chars[] = grapheme_substr($this->value, $i, 1);
}
return $chars;
}
+ /**
+ * Gets the characters of the string, as returned by getGraphemes().
+ *
+ * @deprecated Use a more specific method to express your intent:
+ * getBytes, getCodePoints or getGraphemes
+ */
+ public function getChars () : array {
+ return $this->getGraphemes();
+ }
+
+ /**
+ * Gets the bigrams of the string.
+ *
+ * For UTF-8, bigrams are built from graphemes. For any other encoding,
+ * they are built from code points.
+ *
+ * @return string[]
+ */
public function getBigrams () : array {
+ return match ($this->encoding) {
+ "UTF-8" => $this->getBigramsFromGraphemes(),
+ default => $this->getBigramsFromCodePoints(),
+ };
+ }
+
+ /**
+ * Builds the bigrams by taking, at each grapheme index except the last,
+ * a substring of two graphemes with grapheme_substr().
+ *
+ * @return string[]
+ */
+ private function getBigramsFromGraphemes() : array {
+ $bigrams = [];
+
+ $len = grapheme_strlen($this->value);
+ // Stop at $len - 1: the last grapheme has no successor to pair with.
+ for ($i = 0 ; $i < $len - 1 ; $i++) {
+ $bigrams[] = grapheme_substr($this->value, $i, 2);
+ }
+
+ return $bigrams;
+ }
+
+ /**
+ * Builds the bigrams by taking, at each code point index except the last,
+ * a substring of two code points with mb_substr() in the current encoding.
+ *
+ * @return string[]
+ */
+ private function getBigramsFromCodePoints() : array {
$bigrams = [];
- $len = $this->len();
+ $len = mb_strlen($this->value, $this->encoding);
for ($i = 0 ; $i < $len - 1 ; $i++) {
$bigrams[] = mb_substr($this->value, $i, 2, $this->encoding);
}
return $bigrams;
}
///
/// Transformation methods
///
public function explode (string $delimiter,
int $limit = PHP_INT_MAX) : Vector {
if ($delimiter === "") {
if ($limit < 0) {
return new Vector;
}
return new Vector([$this->value]);
}
return new Vector(explode($delimiter, $this->value, $limit));
}
///
/// Getters and setters
///
/**
* @return string
*/
public function getValue () : string {
return $this->value;
}
/**
* @param string $value
*/
public function setValue (string $value) : void {
$this->value = $value;
}
}
diff --git a/omnitools/src/Strings/Multibyte/StringUtilities.php b/omnitools/src/Strings/Multibyte/StringUtilities.php
index 7702467..7172642 100644
--- a/omnitools/src/Strings/Multibyte/StringUtilities.php
+++ b/omnitools/src/Strings/Multibyte/StringUtilities.php
@@ -1,97 +1,94 @@
<?php
declare(strict_types=1);
namespace Keruald\OmniTools\Strings\Multibyte;
class StringUtilities {
/**
* Pads a multibyte string to a certain length with another string
*
* @param string $input the input string
* @param int $padLength the target string size
* @param string $padString the padding characters (optional, default is space)
* @param int $padType STR_PAD_RIGHT, STR_PAD_LEFT, or STR_PAD_BOTH (optional, default is STR_PAD_RIGHT)
* @param string $encoding the character encoding (optional)
*
* @return string the padded string
*
*/
public static function pad (
string $input,
int $padLength,
string $padString = ' ',
int $padType = STR_PAD_RIGHT,
string $encoding = ''
) : string {
return (new StringPad)
->setInput($input)
->setPadLength($padLength)
->setPadString($padString)
->setPadType($padType)
- ->setEncoding($encoding ?: mb_internal_encoding())
+ // When $encoding is empty, UTF-8 is used.
+ ->setEncoding($encoding ?: "UTF-8")
->pad();
}
+ /**
+ * Determines if an encoding name is supported.
+ *
+ * UTF-8 is always accepted. Any other name must be strictly identical
+ * to one of the names returned by mb_list_encodings().
+ */
public static function isSupportedEncoding (string $encoding) : bool {
- foreach (mb_list_encodings() as $supportedEncoding) {
- if ($encoding === $supportedEncoding) {
- return true;
- }
- }
-
- return false;
+ return match ($encoding) {
+ "UTF-8" => true,
+ // Strict comparison, as the replaced loop used ===.
+ default => in_array($encoding, mb_list_encodings(), true),
+ };
}
/**
* @deprecated Since PHP 8.0, we can replace by \str_starts_with
*/
public static function startsWith (string $string, string $start) : bool {
$length = mb_strlen($start);
return mb_substr($string, 0, $length) === $start;
}
/**
* @deprecated Since PHP 8.0, we can replace by \str_ends_with
*/
public static function endsWith (string $string, string $end) : bool {
$length = mb_strlen($end);
return $length === 0 || mb_substr($string, -$length) === $end;
}
/**
* @deprecated Since PHP 8.0, we can replace by \str_contains
*/
public static function contains (string $string, string $needle) : bool {
return str_contains($string, $needle);
}
/**
* Encode a string using a variant of the MIME base64 compatible with URLs.
*
* The '+' and '/' characters used in base64 are replaced by '-' and '_'.
* The '=' padding is removed.
*
* @param string $string The string to encode
* @return string The encoded string
*/
public static function encodeInBase64 (string $string) : string {
return str_replace(
['+', '/', '='],
['-', '_', ''],
base64_encode($string)
);
}
/**
* Decode a string encoded with StringUtilities::encodeInBase64
*
* @param string $string The string to decode
* @return string The decoded string
*/
public static function decodeFromBase64 (string $string) : string {
$toDecode = str_replace(['-', '_'], ['+', '/'], $string);
return base64_decode($toDecode);
}
}
diff --git a/omnitools/tests/Strings/Multibyte/OmniStringTest.php b/omnitools/tests/Strings/Multibyte/OmniStringTest.php
index 06fadf1..911069a 100644
--- a/omnitools/tests/Strings/Multibyte/OmniStringTest.php
+++ b/omnitools/tests/Strings/Multibyte/OmniStringTest.php
@@ -1,116 +1,214 @@
<?php
declare(strict_types=1);
namespace Keruald\OmniTools\Tests\Strings\Multibyte;
+use Keruald\OmniTools\Collections\Vector;
use Keruald\OmniTools\Strings\Multibyte\OmniString;
use PHPUnit\Framework\Attributes\DataProvider;
use PHPUnit\Framework\TestCase;
class OmniStringTest extends TestCase {
private OmniString $string;
protected function setUp () : void {
$this->string = new OmniString("foo");
}
public function testToString () : void {
$this->assertEquals("foo", (string)$this->string);
$this->assertEquals("foo", $this->string->__toString());
}
public function testPad () : void {
$paddedString = $this->string->pad(9, '-=-', STR_PAD_BOTH);
$this->assertEquals("-=-foo-=-", $paddedString);
}
public function testStartsWith () : void {
$this->assertTrue($this->string->startsWith("fo"));
$this->assertTrue($this->string->startsWith(""));
$this->assertTrue($this->string->startsWith("foo"));
$this->assertFalse($this->string->startsWith("Fo"));
$this->assertFalse($this->string->startsWith("bar"));
}
public function testEndsWith () : void {
$this->assertTrue($this->string->endsWith("oo"));
$this->assertTrue($this->string->endsWith(""));
$this->assertTrue($this->string->endsWith("foo"));
$this->assertFalse($this->string->endsWith("oO"));
$this->assertFalse($this->string->endsWith("bar"));
}
public function testLen () : void {
$this->assertEquals(3, $this->string->len());
}
+ /**
+ * Checks countBytes() against the $bytes column of the provider;
+ * the other expected counts are not used here.
+ */
+ #[DataProvider("provideLengthCounts")]
+ public function testCountBytes (string $string, int $bytes, int $codePoints, int $graphemes) : void {
+ $count = (new OmniString($string))->countBytes();
+ $this->assertEquals($bytes, $count);
+ }
+
+ /**
+ * Checks countCodePoints() against the $codePoints column of the provider;
+ * the other expected counts are not used here.
+ */
+ #[DataProvider("provideLengthCounts")]
+ public function testCountCodePoints (string $string, int $bytes, int $codePoints, int $graphemes) : void {
+ $count = (new OmniString($string))->countCodePoints();
+ $this->assertEquals($codePoints, $count);
+ }
+
+ /**
+ * Checks countGraphemes() against the $graphemes column of the provider;
+ * the other expected counts are not used here.
+ */
+ #[DataProvider("provideLengthCounts")]
+ public function testCountGraphemes (string $string, int $bytes, int $codePoints, int $graphemes) : void {
+ $count = (new OmniString($string))->countGraphemes();
+ $this->assertEquals($graphemes, $count);
+ }
+
#[DataProvider('provideCharactersArrays')]
public function testGetChars (string $string, array $expectedCharacters) : void {
$actualCharacters = (new OmniString($string))->getChars();
$this->assertEquals($expectedCharacters, $actualCharacters);
}
+ /**
+ * Checks getBytes() returns the expected list of single-byte strings.
+ */
+ #[DataProvider("provideBytes")]
+ public function testGetBytes (string $string, array $expectedBytes) : void {
+ $actual = (new OmniString($string))->getBytes();
+
+ $this->assertEquals($expectedBytes, $actual);
+ }
+
+ /**
+ * Checks getCodePoints() returns the expected list of code points.
+ */
+ #[DataProvider("provideCodePoints")]
+ public function testGetCodePoints (string $string, array $expectedCodePoints) : void {
+ $actual = (new OmniString($string))->getCodePoints();
+
+ $this->assertEquals($expectedCodePoints, $actual);
+ }
+
+ /**
+ * Checks getGraphemes() returns the expected list of graphemes.
+ */
+ #[DataProvider("provideGraphemes")]
+ public function testGetGraphemes (string $string, array $expectedGraphemes) : void {
+ $actual = (new OmniString($string))->getGraphemes();
+
+ $this->assertEquals($expectedGraphemes, $actual);
+ }
+
#[DataProvider('provideCharactersBigrams')]
public function testBigrams (string $string, array $expectedBigrams) : void {
$actualBigrams = (new OmniString($string))->getBigrams();
$this->assertEquals($expectedBigrams, $actualBigrams);
}
#[DataProvider('provideExplosions')]
public function testExplode (string $delimiter, string $imploded, array $exploded) : void {
$actual = (new OmniString($imploded))
->explode($delimiter)
->toArray();
$this->assertEquals($exploded, $actual);
}
public function testExplodeWithEmptyOmniArray () : void {
$array = (new OmniString("foo"))
->explode("", -1);
$this->assertEquals(0, count($array->toArray()));
}
///
/// Data providers
///
public static function provideCharactersArrays () : iterable {
yield ["foo", ['f', 'o', 'o']];
yield [
'àèòàFOOàèòà',
['à', 'è', 'ò', 'à', 'F', 'O', 'O', 'à', 'è', 'ò', 'à']
];
- yield ["🇩🇪", ["🇩", "🇪"]];
+ yield ["🇩🇪", ["🇩🇪"]];
yield ["", []];
}
public static function provideCharactersBigrams () : iterable {
yield ["foo", ['fo', 'oo']];
yield ["night", ['ni', 'ig', 'gh', 'ht']];
- yield ["🇩🇪", ["🇩🇪"]];
+ yield ["x", []]; // Only one character -> no bigram
+
+ yield ["🇩🇪", []]; // Only one character -> no bigram
yield ["", []];
}
+
public static function provideExplosions () : iterable {
yield ["/", "a/b/c", ['a', 'b', 'c']];
yield ["/", "abc", ['abc']];
yield ["/", "/b/c", ['', 'b', 'c']];
yield ["/", "a/b/", ['a', 'b', '']];
yield ["", "a/b/c", ['a/b/c']];
yield ["x", "a/b/c", ['a/b/c']];
}
+ public static function provideLengthCounts () : iterable {
+ // String, bytes, code points, graphemes
+ yield ["🏴", 28, 7, 1];
+
+ yield ["", 0, 0, 0];
+ yield ["a", 1, 1, 1];
+ yield ["foo", 3, 3, 3];
+ yield ["é", 2, 1, 1];
+
+ yield ["\0", 1, 1, 1]; // PHP strings are NOT null-terminated
+ }
+
+ public static function provideBytes () : iterable {
+ yield ["🏴", [
+ "\xF0", "\x9F", "\x8F", "\xB4",
+ "\xF3", "\xA0", "\x81", "\xA7",
+ "\xF3", "\xA0", "\x81", "\xA2",
+ "\xF3", "\xA0", "\x81", "\xA5",
+ "\xF3", "\xA0", "\x81", "\xAE",
+ "\xF3", "\xA0", "\x81", "\xA7",
+ "\xF3", "\xA0", "\x81", "\xBF",
+ ]];
+
+ yield ["", []];
+ yield ["a", ["a"]];
+ yield ["foo", ["f", "o", "o"]];
+ yield ["é", ["\xC3", "\xA9"]];
+ }
+
+ public static function provideCodePoints () : iterable {
+ yield ["🏴", [
+ "\xF0\x9F\x8F\xB4",
+ "\xF3\xA0\x81\xA7",
+ "\xF3\xA0\x81\xA2",
+ "\xF3\xA0\x81\xA5",
+ "\xF3\xA0\x81\xAE",
+ "\xF3\xA0\x81\xA7",
+ "\xF3\xA0\x81\xBF",
+ ]];
+
+ yield ["", []];
+ yield ["a", ["a"]];
+ yield ["foo", ["f", "o", "o"]];
+ yield ["é", ["é"]];
+ }
+
+ public static function provideGraphemes () : iterable {
+ yield ["🏴", ["🏴"]];
+
+ yield ["", []];
+ yield ["a", ["a"]];
+ yield ["foo", ["f", "o", "o"]];
+ yield ["é", ["é"]];
+ }
+
}
File Metadata
Details
Attached
Mime Type
text/x-diff
Expires
Mon, Sep 15, 07:22 (1 d, 5 h)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
2973885
Default Alt Text
(15 KB)
Attached To
Mode
rKERUALD Keruald libraries development repository
Attached
Detach File
Event Timeline
Log In to Comment