Page MenuHomeDevCentral

D2550.diff
No OneTemporary

D2550.diff

diff --git a/omnitools/src/Strings/Multibyte/OmniString.php b/omnitools/src/Strings/Multibyte/OmniString.php
--- a/omnitools/src/Strings/Multibyte/OmniString.php
+++ b/omnitools/src/Strings/Multibyte/OmniString.php
@@ -5,6 +5,10 @@
use Keruald\OmniTools\Collections\Vector;
+/**
+ * Represents a multibyte string and performs operations with the grapheme
+ * library for UTF-8 encoding, and mbstring for other encodings.
+ */
class OmniString {
use WithEncoding;
@@ -13,10 +17,7 @@
/// Private members
///
- /**
- * @var string
- */
- private $value;
+ private string $value;
///
/// Constructor
@@ -61,25 +62,82 @@
return str_ends_with($this->value, $end);
}
+ /**
+ * @deprecated Use a more specific method to express your intent:
+ * countBytes, countCodePoints or countGraphemes
+ */
public function len () : int {
+ return $this->countGraphemes();
+ }
+
+ public function countBytes () : int {
+ return strlen($this->value);
+ }
+
+ public function countCodePoints () : int {
return mb_strlen($this->value, $this->encoding);
}
- public function getChars () : array {
+ public function countGraphemes () : int {
+ return match ($this->encoding) {
+ "UTF-8" => grapheme_strlen($this->value), // intl: length in grapheme units
+ default => $this->countCodePoints(), // other encodings: code point count
+ };
+ }
+
+ public function getBytes() : array {
+ return $this->value === "" ? [] : str_split($this->value, 1); // str_split("") yields [""] before PHP 8.2
+ }
+
+ public function getCodePoints () : array {
+ return mb_str_split($this->value, 1, $this->encoding);
+ }
+
+ public function getGraphemes () : array {
+ if ($this->encoding !== "UTF-8") {
+ return $this->getCodePoints();
+ }
+
$chars = [];
- $len = $this->len();
+ $len = grapheme_strlen($this->value);
for ($i = 0 ; $i < $len ; $i++) {
- $chars[] = mb_substr($this->value, $i, 1, $this->encoding);
+ $chars[] = grapheme_substr($this->value, $i, 1);
}
return $chars;
}
+ /**
+ * @deprecated Use a more specific method to express your intent:
+ * getBytes, getCodePoints or getGraphemes
+ */
+ public function getChars () : array {
+ return $this->getGraphemes();
+ }
+
public function getBigrams () : array {
+ return match ($this->encoding) {
+ "UTF-8" => $this->getBigramsFromGraphemes(),
+ default => $this->getBigramsFromCodePoints(),
+ };
+ }
+
+ private function getBigramsFromGraphemes() : array {
+ $bigrams = [];
+
+ $len = grapheme_strlen($this->value);
+ for ($i = 0 ; $i < $len - 1 ; $i++) {
+ $bigrams[] = grapheme_substr($this->value, $i, 2);
+ }
+
+ return $bigrams;
+ }
+
+ private function getBigramsFromCodePoints() : array {
$bigrams = [];
- $len = $this->len();
+ $len = mb_strlen($this->value, $this->encoding);
for ($i = 0 ; $i < $len - 1 ; $i++) {
$bigrams[] = mb_substr($this->value, $i, 2, $this->encoding);
}
diff --git a/omnitools/src/Strings/Multibyte/StringUtilities.php b/omnitools/src/Strings/Multibyte/StringUtilities.php
--- a/omnitools/src/Strings/Multibyte/StringUtilities.php
+++ b/omnitools/src/Strings/Multibyte/StringUtilities.php
@@ -29,18 +29,15 @@
->setPadLength($padLength)
->setPadString($padString)
->setPadType($padType)
- ->setEncoding($encoding ?: mb_internal_encoding())
+ ->setEncoding($encoding ?: "UTF-8")
->pad();
}
public static function isSupportedEncoding (string $encoding) : bool {
- foreach (mb_list_encodings() as $supportedEncoding) {
- if ($encoding === $supportedEncoding) {
- return true;
- }
- }
-
- return false;
+ return match ($encoding) {
+ "UTF-8" => true,
+ default => in_array($encoding, mb_list_encodings(), true), // strict: same === semantics as the loop it replaces
+ };
}
/**
diff --git a/omnitools/tests/Strings/Multibyte/OmniStringTest.php b/omnitools/tests/Strings/Multibyte/OmniStringTest.php
--- a/omnitools/tests/Strings/Multibyte/OmniStringTest.php
+++ b/omnitools/tests/Strings/Multibyte/OmniStringTest.php
@@ -3,6 +3,7 @@
namespace Keruald\OmniTools\Tests\Strings\Multibyte;
+use Keruald\OmniTools\Collections\Vector;
use Keruald\OmniTools\Strings\Multibyte\OmniString;
use PHPUnit\Framework\Attributes\DataProvider;
use PHPUnit\Framework\TestCase;
@@ -47,6 +48,24 @@
$this->assertEquals(3, $this->string->len());
}
+ #[DataProvider("provideLengthCounts")]
+ public function testCountBytes (string $string, int $bytes, int $codePoints, int $graphemes) : void {
+ $count = (new OmniString($string))->countBytes();
+ $this->assertSame($bytes, $count); // identity check: value and int type must both match
+ }
+
+ #[DataProvider("provideLengthCounts")]
+ public function testCountCodePoints (string $string, int $bytes, int $codePoints, int $graphemes) : void {
+ $count = (new OmniString($string))->countCodePoints();
+ $this->assertSame($codePoints, $count); // identity check: value and int type must both match
+ }
+
+ #[DataProvider("provideLengthCounts")]
+ public function testCountGraphemes (string $string, int $bytes, int $codePoints, int $graphemes) : void {
+ $count = (new OmniString($string))->countGraphemes();
+ $this->assertSame($graphemes, $count); // identity check: value and int type must both match
+ }
+
#[DataProvider('provideCharactersArrays')]
public function testGetChars (string $string, array $expectedCharacters) : void {
$actualCharacters = (new OmniString($string))->getChars();
@@ -54,6 +73,27 @@
$this->assertEquals($expectedCharacters, $actualCharacters);
}
+ #[DataProvider("provideBytes")]
+ public function testGetBytes (string $string, array $expectedBytes) : void {
+ $actual = (new OmniString($string))->getBytes();
+
+ $this->assertSame($expectedBytes, $actual); // strict: same strings in the same order, no type juggling
+ }
+
+ #[DataProvider("provideCodePoints")]
+ public function testGetCodePoints (string $string, array $expectedCodePoints) : void {
+ $actual = (new OmniString($string))->getCodePoints();
+
+ $this->assertSame($expectedCodePoints, $actual); // strict: same strings in the same order, no type juggling
+ }
+
+ #[DataProvider("provideGraphemes")]
+ public function testGetGraphemes (string $string, array $expectedGraphemes) : void {
+ $actual = (new OmniString($string))->getGraphemes();
+
+ $this->assertSame($expectedGraphemes, $actual); // strict: same strings in the same order, no type juggling
+ }
+
#[DataProvider('provideCharactersBigrams')]
public function testBigrams (string $string, array $expectedBigrams) : void {
$actualBigrams = (new OmniString($string))->getBigrams();
@@ -89,7 +129,7 @@
['à', 'è', 'ò', 'à', 'F', 'O', 'O', 'à', 'è', 'ò', 'à']
];
- yield ["🇩🇪", ["🇩", "🇪"]];
+ yield ["🇩🇪", ["🇩🇪"]];
yield ["", []];
}
@@ -99,10 +139,13 @@
yield ["night", ['ni', 'ig', 'gh', 'ht']];
- yield ["🇩🇪", ["🇩🇪"]];
+ yield ["x", []]; // Only one character -> no bigram
+
+ yield ["🇩🇪", []]; // Only one character -> no bigram
yield ["", []];
}
+
public static function provideExplosions () : iterable {
yield ["/", "a/b/c", ['a', 'b', 'c']];
yield ["/", "abc", ['abc']];
@@ -113,4 +156,59 @@
yield ["x", "a/b/c", ['a/b/c']];
}
+ public static function provideLengthCounts () : iterable {
+ // String, bytes, code points, graphemes
+ yield ["🏴󠁧󠁢󠁥󠁮󠁧󠁿", 28, 7, 1];
+
+ yield ["", 0, 0, 0];
+ yield ["a", 1, 1, 1];
+ yield ["foo", 3, 3, 3];
+ yield ["é", 2, 1, 1];
+
+ yield ["\0", 1, 1, 1]; // PHP strings are NOT null-terminated
+ }
+
+ public static function provideBytes () : iterable {
+ yield ["🏴󠁧󠁢󠁥󠁮󠁧󠁿", [
+ "\xF0", "\x9F", "\x8F", "\xB4",
+ "\xF3", "\xA0", "\x81", "\xA7",
+ "\xF3", "\xA0", "\x81", "\xA2",
+ "\xF3", "\xA0", "\x81", "\xA5",
+ "\xF3", "\xA0", "\x81", "\xAE",
+ "\xF3", "\xA0", "\x81", "\xA7",
+ "\xF3", "\xA0", "\x81", "\xBF",
+ ]];
+
+ yield ["", []];
+ yield ["a", ["a"]];
+ yield ["foo", ["f", "o", "o"]];
+ yield ["é", ["\xC3", "\xA9"]];
+ }
+
+ public static function provideCodePoints () : iterable {
+ yield ["🏴󠁧󠁢󠁥󠁮󠁧󠁿", [
+ "\xF0\x9F\x8F\xB4",
+ "\xF3\xA0\x81\xA7",
+ "\xF3\xA0\x81\xA2",
+ "\xF3\xA0\x81\xA5",
+ "\xF3\xA0\x81\xAE",
+ "\xF3\xA0\x81\xA7",
+ "\xF3\xA0\x81\xBF",
+ ]];
+
+ yield ["", []];
+ yield ["a", ["a"]];
+ yield ["foo", ["f", "o", "o"]];
+ yield ["é", ["é"]];
+ }
+
+ public static function provideGraphemes () : iterable {
+ yield ["🏴󠁧󠁢󠁥󠁮󠁧󠁿", ["🏴󠁧󠁢󠁥󠁮󠁧󠁿"]];
+
+ yield ["", []];
+ yield ["a", ["a"]];
+ yield ["foo", ["f", "o", "o"]];
+ yield ["é", ["é"]];
+ }
+
}

File Metadata

Mime Type
text/plain
Expires
Fri, Nov 22, 13:15 (8 h, 5 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
2256226
Default Alt Text
D2550.diff (9 KB)

Event Timeline