Page Menu
Home
DevCentral
Search
Configure Global Search
Log In
Files
F3762692
D2550.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
9 KB
Referenced Files
None
Subscribers
None
D2550.diff
View Options
diff --git a/omnitools/src/Strings/Multibyte/OmniString.php b/omnitools/src/Strings/Multibyte/OmniString.php
--- a/omnitools/src/Strings/Multibyte/OmniString.php
+++ b/omnitools/src/Strings/Multibyte/OmniString.php
@@ -5,6 +5,10 @@
use Keruald\OmniTools\Collections\Vector;
+/**
+ * Represents a multibyte string and performs operations with the grapheme
+ * library for UTF-8 encoding, and with mbstring for other encodings.
+ */
class OmniString {
use WithEncoding;
@@ -13,10 +17,7 @@
/// Private members
///
- /**
- * @var string
- */
- private $value;
+ private string $value;
///
/// Constructor
@@ -61,25 +62,82 @@
return str_ends_with($this->value, $end);
}
+ /**
+ * @deprecated Use a more specific method to express your intent:
+ * countBytes, countCodePoints or countGraphemes
+ */
public function len () : int {
+ return $this->countGraphemes();
+ }
+
+ public function countBytes () : int {
+ return strlen($this->value);
+ }
+
+ public function countCodePoints () : int {
return mb_strlen($this->value, $this->encoding);
}
- public function getChars () : array {
+ public function countGraphemes () : int {
+ return match ($this->encoding) {
+ "UTF-8" => grapheme_strlen($this->value),
+ default => $this->countCodepoints(),
+ };
+ }
+
+ public function getBytes() : array {
+ return str_split($this->value, 1);
+ }
+
+ public function getCodePoints () : array {
+ return mb_str_split($this->value, 1, $this->encoding);
+ }
+
+ public function getGraphemes () : array {
+ if ($this->encoding !== "UTF-8") {
+ return $this->getCodePoints();
+ }
+
$chars = [];
- $len = $this->len();
+ $len = grapheme_strlen($this->value);
for ($i = 0 ; $i < $len ; $i++) {
- $chars[] = mb_substr($this->value, $i, 1, $this->encoding);
+ $chars[] = grapheme_substr($this->value, $i, 1);
}
return $chars;
}
+ /**
+ * @deprecated Use a more specific method to express your intent:
+ * getBytes, getCodePoints or getGraphemes
+ */
+ public function getChars () : array {
+ return $this->getGraphemes();
+ }
+
public function getBigrams () : array {
+ return match ($this->encoding) {
+ "UTF-8" => $this->getBigramsFromGraphemes(),
+ default => $this->getBigramsFromCodePoints(),
+ };
+ }
+
+ private function getBigramsFromGraphemes() : array {
+ $bigrams = [];
+
+ $len = grapheme_strlen($this->value);
+ for ($i = 0 ; $i < $len - 1 ; $i++) {
+ $bigrams[] = grapheme_substr($this->value, $i, 2);
+ }
+
+ return $bigrams;
+ }
+
+ private function getBigramsFromCodePoints() : array {
$bigrams = [];
- $len = $this->len();
+ $len = mb_strlen($this->value, $this->encoding);
for ($i = 0 ; $i < $len - 1 ; $i++) {
$bigrams[] = mb_substr($this->value, $i, 2, $this->encoding);
}
diff --git a/omnitools/src/Strings/Multibyte/StringUtilities.php b/omnitools/src/Strings/Multibyte/StringUtilities.php
--- a/omnitools/src/Strings/Multibyte/StringUtilities.php
+++ b/omnitools/src/Strings/Multibyte/StringUtilities.php
@@ -29,18 +29,15 @@
->setPadLength($padLength)
->setPadString($padString)
->setPadType($padType)
- ->setEncoding($encoding ?: mb_internal_encoding())
+ ->setEncoding($encoding ?: "UTF-8")
->pad();
}
public static function isSupportedEncoding (string $encoding) : bool {
- foreach (mb_list_encodings() as $supportedEncoding) {
- if ($encoding === $supportedEncoding) {
- return true;
- }
- }
-
- return false;
+ return match ($encoding) {
+ "UTF-8" => true,
+ default => in_array($encoding, mb_list_encodings()),
+ };
}
/**
diff --git a/omnitools/tests/Strings/Multibyte/OmniStringTest.php b/omnitools/tests/Strings/Multibyte/OmniStringTest.php
--- a/omnitools/tests/Strings/Multibyte/OmniStringTest.php
+++ b/omnitools/tests/Strings/Multibyte/OmniStringTest.php
@@ -3,6 +3,7 @@
namespace Keruald\OmniTools\Tests\Strings\Multibyte;
+use Keruald\OmniTools\Collections\Vector;
use Keruald\OmniTools\Strings\Multibyte\OmniString;
use PHPUnit\Framework\Attributes\DataProvider;
use PHPUnit\Framework\TestCase;
@@ -47,6 +48,24 @@
$this->assertEquals(3, $this->string->len());
}
+ #[DataProvider("provideLengthCounts")]
+ public function testCountBytes ($string, $bytes, $codePoints, $graphemes) : void {
+ $count = (new OmniString($string))->countBytes();
+ $this->assertEquals($bytes, $count);
+ }
+
+ #[DataProvider("provideLengthCounts")]
+ public function testCountCodePoints ($string, $bytes, $codePoints, $graphemes) : void {
+ $count = (new OmniString($string))->countCodePoints();
+ $this->assertEquals($codePoints, $count);
+ }
+
+ #[DataProvider("provideLengthCounts")]
+ public function testCountGraphemes ($string, $bytes, $codePoints, $graphemes) : void {
+ $count = (new OmniString($string))->countGraphemes();
+ $this->assertEquals($graphemes, $count);
+ }
+
#[DataProvider('provideCharactersArrays')]
public function testGetChars (string $string, array $expectedCharacters) : void {
$actualCharacters = (new OmniString($string))->getChars();
@@ -54,6 +73,27 @@
$this->assertEquals($expectedCharacters, $actualCharacters);
}
+ #[DataProvider("provideBytes")]
+ public function testGetBytes (string $string, array $expectedBytes) : void {
+ $actual = (new OmniString($string))->getBytes();
+
+ $this->assertEquals($expectedBytes, $actual);
+ }
+
+ #[DataProvider("provideCodePoints")]
+ public function testGetCodePoints (string $string, array $expectedCodePoints) : void {
+ $actual = (new OmniString($string))->getCodePoints();
+
+ $this->assertEquals($expectedCodePoints, $actual);
+ }
+
+ #[DataProvider("provideGraphemes")]
+ public function testGetGraphemes (string $string, array $expectedGraphemes) : void {
+ $actual = (new OmniString($string))->getGraphemes();
+
+ $this->assertEquals($expectedGraphemes, $actual);
+ }
+
#[DataProvider('provideCharactersBigrams')]
public function testBigrams (string $string, array $expectedBigrams) : void {
$actualBigrams = (new OmniString($string))->getBigrams();
@@ -89,7 +129,7 @@
['à', 'è', 'ò', 'à', 'F', 'O', 'O', 'à', 'è', 'ò', 'à']
];
- yield ["🇩🇪", ["🇩", "🇪"]];
+ yield ["🇩🇪", ["🇩🇪"]];
yield ["", []];
}
@@ -99,10 +139,13 @@
yield ["night", ['ni', 'ig', 'gh', 'ht']];
- yield ["🇩🇪", ["🇩🇪"]];
+ yield ["x", []]; // Only one character -> no bigram
+
+ yield ["🇩🇪", []]; // Only one grapheme -> no bigram
yield ["", []];
}
+
public static function provideExplosions () : iterable {
yield ["/", "a/b/c", ['a', 'b', 'c']];
yield ["/", "abc", ['abc']];
@@ -113,4 +156,59 @@
yield ["x", "a/b/c", ['a/b/c']];
}
+ public static function provideLengthCounts () : iterable {
+ // String, bytes, code points, graphemes
+ yield ["🏴", 28, 7, 1];
+
+ yield ["", 0, 0, 0];
+ yield ["a", 1, 1, 1];
+ yield ["foo", 3, 3, 3];
+ yield ["é", 2, 1, 1];
+
+ yield ["\0", 1, 1, 1]; // PHP strings are NOT null-terminated
+ }
+
+ public static function provideBytes () : iterable {
+ yield ["🏴", [
+ "\xF0", "\x9F", "\x8F", "\xB4",
+ "\xF3", "\xA0", "\x81", "\xA7",
+ "\xF3", "\xA0", "\x81", "\xA2",
+ "\xF3", "\xA0", "\x81", "\xA5",
+ "\xF3", "\xA0", "\x81", "\xAE",
+ "\xF3", "\xA0", "\x81", "\xA7",
+ "\xF3", "\xA0", "\x81", "\xBF",
+ ]];
+
+ yield ["", []];
+ yield ["a", ["a"]];
+ yield ["foo", ["f", "o", "o"]];
+ yield ["é", ["\xC3", "\xA9"]];
+ }
+
+ public static function provideCodePoints () : iterable {
+ yield ["🏴", [
+ "\xF0\x9F\x8F\xB4",
+ "\xF3\xA0\x81\xA7",
+ "\xF3\xA0\x81\xA2",
+ "\xF3\xA0\x81\xA5",
+ "\xF3\xA0\x81\xAE",
+ "\xF3\xA0\x81\xA7",
+ "\xF3\xA0\x81\xBF",
+ ]];
+
+ yield ["", []];
+ yield ["a", ["a"]];
+ yield ["foo", ["f", "o", "o"]];
+ yield ["é", ["é"]];
+ }
+
+ public static function provideGraphemes () : iterable {
+ yield ["🏴", ["🏴"]];
+
+ yield ["", []];
+ yield ["a", ["a"]];
+ yield ["foo", ["f", "o", "o"]];
+ yield ["é", ["é"]];
+ }
+
}
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Fri, Nov 22, 13:15 (8 h, 5 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
2256226
Default Alt Text
D2550.diff (9 KB)
Attached To
Mode
D2550: Support Grapheme functions for UTF-8 strings
Attached
Detach File
Event Timeline
Log In to Comment