diff --git a/src/Strings/Multibyte/OmniString.php b/src/Strings/Multibyte/OmniString.php
--- a/src/Strings/Multibyte/OmniString.php
+++ b/src/Strings/Multibyte/OmniString.php
@@ -63,6 +63,44 @@
         return mb_strlen($this->value, $this->encoding);
     }
 
+    /**
+     * Splits the string into an array of single characters.
+     *
+     * Characters are extracted with mb_substr() using the string encoding,
+     * so a multibyte character is returned as one element.
+     *
+     * @return string[] The characters in order, or [] for an empty string
+     */
+    public function getChars () : array {
+        $chars = [];
+
+        $len = $this->len();
+        for ($i = 0 ; $i < $len ; $i++) {
+            $chars[] = mb_substr($this->value, $i, 1, $this->encoding);
+        }
+
+        return $chars;
+    }
+
+    /**
+     * Gets every pair of adjacent characters of the string.
+     *
+     * The pairs overlap: "night" gives ['ni', 'ig', 'gh', 'ht'].
+     * A string shorter than two characters gives an empty array.
+     *
+     * @return string[] The bigrams, in order of appearance
+     */
+    public function getBigrams () : array {
+        $bigrams = [];
+
+        $len = $this->len();
+        for ($i = 0 ; $i < $len - 1 ; $i++) {
+            $bigrams[] = mb_substr($this->value, $i, 2, $this->encoding);
+        }
+
+        return $bigrams;
+    }
+
     /**
      * @return string
      */
@@ -77,4 +115,5 @@
         $this->value = $value;
     }
 
+
 }
diff --git a/src/Strings/SorensenDiceCoefficient.php b/src/Strings/SorensenDiceCoefficient.php
new file mode 100644
--- /dev/null
+++ b/src/Strings/SorensenDiceCoefficient.php
@@ -0,0 +1,99 @@
+x = (new OmniString($left))->getBigrams();
+        $this->y = (new OmniString($right))->getBigrams();
+    }
+
+    /**
+     * Computes the coefficient for two strings in a single call.
+     *
+     * @param string $left  The first string to compare
+     * @param string $right The second string to compare
+     * @return float The value returned by compute() for these strings
+     */
+    public static function computeFor(string $left, string $right) : float {
+        $instance = new self($left, $right);
+
+        return $instance->compute();
+    }
+
+    ///
+    /// Sørensen formula
+    ///
+
+    /**
+     * Computes the Sørensen–Dice coefficient of the two bigram lists:
+     * twice the number of shared bigrams, divided by the total number
+     * of bigrams found in both strings.
+     *
+     * @return float A value between 0.0 (nothing shared) and 1.0 (same
+     *               bigrams); 0.0 when neither string contains a bigram
+     */
+    public function compute() : float {
+        $total = $this->countCharacters();
+
+        if ($total === 0) {
+            // Both strings are shorter than two characters: there is
+            // nothing to compare, and the division would be by zero.
+            return 0.0;
+        }
+
+        return 2 * $this->countIntersect() / $total;
+    }
+
+    /**
+     * Counts the bigrams shared by both strings.
+     *
+     * Each occurrence in one list is matched with at most one occurrence
+     * in the other, so repeated bigrams cannot be counted more often than
+     * they appear on both sides. This keeps the coefficient symmetric and
+     * never above 1.
+     *
+     * @return int The size of the multiset intersection of the bigrams
+     */
+    private function countIntersect () : int {
+        $available = array_count_values($this->y);
+        $shared = 0;
+
+        foreach ($this->x as $bigram) {
+            if (($available[$bigram] ?? 0) > 0) {
+                $available[$bigram]--;
+                $shared++;
+            }
+        }
+
+        return $shared;
+    }
+
+    /**
+     * Counts the bigrams of both strings together.
+     *
+     * @return int The denominator of the coefficient
+     */
+    private function countCharacters () : int {
+        return count($this->x) + count($this->y);
+    }
+
+}
diff --git a/tests/Strings/Multibyte/OmniStringTest.php b/tests/Strings/Multibyte/OmniStringTest.php
--- a/tests/Strings/Multibyte/OmniStringTest.php
+++ b/tests/Strings/Multibyte/OmniStringTest.php
@@ -49,4 +49,50 @@
         $this->assertEquals(3, $this->string->len());
     }
 
+    /**
+     * @dataProvider provideCharactersArrays
+     */
+    public function testGetChars (string $string, array $expectedCharacters) : void {
+        $actualCharacters = (new OmniString($string))->getChars();
+
+        $this->assertEquals($expectedCharacters, $actualCharacters);
+    }
+
+    /**
+     * @dataProvider provideCharactersBigrams
+     */
+    public function testBigrams (string $string, array $expectedBigrams) : void {
+        $actualBigrams = (new OmniString($string))->getBigrams();
+
+        $this->assertEquals($expectedBigrams, $actualBigrams);
+    }
+
+    ///
+    /// Data providers
+    ///
+
+    public function provideCharactersArrays () : iterable {
+        yield ["foo", ['f', 'o', 'o']];
+
+        yield [
+            'àèòàFOOàèòà',
+            ['à', 'è', 'ò', 'à', 'F', 'O', 'O', 'à', 'è', 'ò', 'à']
+        ];
+
+        yield ["🇩🇪", ["🇩", "🇪"]];
+
+        yield ["", []];
+    }
+
+
+    public function provideCharactersBigrams () : iterable {
+        yield ["foo", ['fo', 'oo']];
+
+        yield ["night", ['ni', 'ig', 'gh', 'ht']];
+
+        yield ["🇩🇪", ["🇩🇪"]];
+
+        yield ["", []];
+    }
+
+
 }
diff --git a/tests/Strings/SorensenDiceCoefficientTest.php b/tests/Strings/SorensenDiceCoefficientTest.php
new file mode 100644
--- /dev/null
+++ b/tests/Strings/SorensenDiceCoefficientTest.php
@@ -0,0 +1,16 @@
+assertEquals(0.25, $actual->compute());
+    }
+}