* @copyright 2011-2024 Nicola Asuni - Tecnick.com LTD
* @license   http://www.gnu.org/copyleft/lesser.html GNU-LGPL v3 (see LICENSE.TXT)
* @link      https://github.com/tecnickcom/tc-lib-unicode
*
* This file is part of tc-lib-unicode software library.
*/

namespace Com\Tecnick\Unicode;

use Com\Tecnick\Unicode\Bidi\Shaping;
use Com\Tecnick\Unicode\Bidi\StepI;
use Com\Tecnick\Unicode\Bidi\StepL;
use Com\Tecnick\Unicode\Bidi\StepN;
use Com\Tecnick\Unicode\Bidi\StepP;
use Com\Tecnick\Unicode\Bidi\StepW;
use Com\Tecnick\Unicode\Bidi\StepX;
use Com\Tecnick\Unicode\Bidi\StepXten;
use Com\Tecnick\Unicode\Data\Pattern as UniPattern;
use Com\Tecnick\Unicode\Data\Type as UniType;
use Com\Tecnick\Unicode\Exception as UnicodeException;

/**
 * Com\Tecnick\Unicode\Bidi
 *
 * Applies the Unicode Bidirectional Algorithm to an input given as a string,
 * an array of UTF-8 chars or an array of codepoints, and exposes the result
 * in the same three representations.
 *
 * @since     2015-07-13
 * @category  Library
 * @package   Unicode
 * @author    Nicola Asuni
 * @copyright 2011-2024 Nicola Asuni - Tecnick.com LTD
 * @license   http://www.gnu.org/copyleft/lesser.html GNU-LGPL v3 (see LICENSE.TXT)
 * @link      https://github.com/tecnickcom/tc-lib-unicode
 */
class Bidi
{
    /**
     * String to process
     */
    protected string $str = '';

    /**
     * Array of UTF-8 chars
     *
     * @var array<int, string>
     */
    protected array $chrarr = [];

    /**
     * Array of UTF-8 codepoints
     *
     * @var array<int, int>
     */
    protected array $ordarr = [];

    /**
     * Processed string
     */
    protected string $bidistr = '';

    /**
     * Array of processed UTF-8 chars
     *
     * @var array<int, string>
     */
    protected array $bidichrarr = [];

    /**
     * Array of processed UTF-8 codepoints
     *
     * @var array<int, int>
     */
    protected array $bidiordarr = [];

    /**
     * If 'R' forces RTL, if 'L' forces LTR
     */
    protected string $forcedir = '';

    /**
     * If true enable shaping
     */
    protected bool $shaping = true;

    /**
     * True if the string contains arabic characters
     */
    protected bool $arabic = false;

    /**
     * Array of character data
     *
     * @var array
     */
    protected array $chardata = [];

    /**
     * Convert object
     */
    protected Convert $conv;

    /**
     * Reverse the RTL substrings using the Bidirectional Algorithm
     * http://unicode.org/reports/tr9/
     *
     * When the input does not require RTL handling (see isRtlMode()) the
     * processed string/chars/codepoints are plain copies of the input.
     *
     * @param ?string $str      String to convert (if null it will be generated from $chrarr or $ordarr)
     * @param ?array  $chrarr   Array of UTF-8 chars (if empty it will be generated from $str or $ordarr)
     * @param ?array  $ordarr   Array of UTF-8 codepoints (if empty it will be generated from $str or $chrarr)
     * @param string  $forcedir If 'R' forces RTL, if 'L' forces LTR
     * @param bool    $shaping  If true enable the shaping algorithm
     *
     * @throws UnicodeException if $str is null and both arrays are null or empty
     */
    public function __construct(
        ?string $str = null,
        ?array $chrarr = null,
        ?array $ordarr = null,
        string $forcedir = '',
        bool $shaping = true
    ) {
        if (($str === null) && ($chrarr === null || $chrarr === []) && ($ordarr === null || $ordarr === [])) {
            throw new UnicodeException('empty input');
        }

        $this->conv = new Convert();
        $this->setInput($str, $chrarr, $ordarr, $forcedir);

        if (! $this->isRtlMode()) {
            // Nothing to reorder: the output is the input.
            $this->bidistr = $this->str;
            $this->bidichrarr = $this->chrarr;
            $this->bidiordarr = $this->ordarr;
            return;
        }

        // Shaping is only enabled when arabic characters were detected.
        $this->shaping = ($shaping && $this->arabic);
        $this->process();
    }

    /**
     * Set Input data
     *
     * Fills in whichever of the three input representations are missing
     * from the ones that were provided, then stores them on the object.
     *
     * @param ?string $str      String to convert (if null it will be generated from $chrarr or $ordarr)
     * @param ?array  $chrarr   Array of UTF-8 chars (if empty it will be generated from $str or $ordarr)
     * @param ?array  $ordarr   Array of UTF-8 codepoints (if empty it will be generated from $str or $chrarr)
     * @param string  $forcedir If 'R' forces RTL, if 'L' forces LTR
     *
     * @SuppressWarnings("PHPMD.CyclomaticComplexity")
     */
    protected function setInput(
        ?string $str = null,
        ?array $chrarr = null,
        ?array $ordarr = null,
        string $forcedir = ''
    ): void {
        if ($str === null) {
            if (($chrarr === null || $chrarr === []) && ($ordarr !== null && $ordarr !== [])) {
                $chrarr = $this->conv->ordArrToChrArr($ordarr);
            }

            // Fall back to an empty list so that a call with no usable input
            // yields an empty string instead of passing null to implode().
            $str = implode('', $chrarr ?? []);
        }

        if ($chrarr === null || $chrarr === []) {
            $chrarr = $this->conv->strToChrArr($str);
        }

        if ($ordarr === null || $ordarr === []) {
            $ordarr = $this->conv->chrArrToOrdArr($chrarr);
        }

        $this->str = $str;
        $this->chrarr = $chrarr;
        $this->ordarr = $ordarr;

        // Only the first character of $forcedir is considered, upper-cased.
        $this->forcedir = ($forcedir === '') ? '' : strtoupper($forcedir[0]);
    }

    /**
     * Returns the processed array of UTF-8 codepoints
     *
     * @return array<int, int>
     */
    public function getOrdArray(): array
    {
        return $this->bidiordarr;
    }

    /**
     * Returns the processed array of UTF-8 chars
     *
     * The array is built from the processed codepoints on first access
     * and then cached.
     *
     * @return array<int, string>
     */
    public function getChrArray(): array
    {
        if ($this->bidichrarr === []) {
            $this->bidichrarr = $this->conv->ordArrToChrArr($this->bidiordarr);
        }

        return $this->bidichrarr;
    }

    /**
     * Returns the number of characters in the processed string
     */
    public function getNumChars(): int
    {
        return count($this->getChrArray());
    }

    /**
     * Returns the processed string
     *
     * The string is built from the processed chars on first access
     * and then cached.
     */
    public function getString(): string
    {
        if ($this->bidistr === '') {
            $this->bidistr = implode('', $this->getChrArray());
        }

        return $this->bidistr;
    }

    /**
     * Returns an array with processed chars as keys
     *
     * @return array<int, bool> Map of processed codepoint => true
     */
    public function getCharKeys(): array
    {
        // array_fill_keys() only reads the values of its first argument.
        return array_fill_keys($this->bidiordarr, true);
    }

    /**
     * P1. Split the text into separate paragraphs.
     * A paragraph separator is kept with the previous paragraph.
     *
     * @return array<int, array<int, int>> List of paragraphs, each a list of codepoints
     */
    protected function getParagraphs(): array
    {
        $paragraph = [
            0 => [],
        ];
        $pdx = 0; // paragraphs index
        foreach ($this->ordarr as $ord) {
            $paragraph[$pdx][] = $ord;
            if ((UniType::UNI[$ord] ?? null) === 'B') {
                // A character of type 'B' closes the current paragraph.
                ++$pdx;
                $paragraph[$pdx] = [];
            }
        }

        return $paragraph;
    }

    /**
     * Process the string
     *
     * Runs every paragraph through the algorithm steps and appends the
     * resulting codepoints to the processed codepoints array.
     *
     * @SuppressWarnings("PHPMD.CyclomaticComplexity")
     */
    protected function process(): void
    {
        // Split the text into separate paragraphs and, within each paragraph,
        // apply all the other rules of this algorithm.
        foreach ($this->getParagraphs() as $par) {
            $pel = $this->getPel($par);
            $stepx = new StepX($par, $pel);
            $stepx10 = new StepXten($stepx->getChrData(), $pel);

            $runs = [];
            $maxlevel = 0;
            foreach ($stepx10->getIsolatedLevelRunSequences() as $ilr) {
                $stepw = new StepW($ilr);
                $stepn = new StepN($stepw->getSequence());
                $stepi = new StepI($stepn->getSequence());
                $seq = $stepi->getSequence();
                if ($this->shaping) {
                    $shaper = new Shaping($seq);
                    $seq = $shaper->getSequence();
                }

                // Collect the runs and merge them once after the loop,
                // instead of re-copying the accumulated array per run.
                $runs[] = $seq['item'];
                if ($seq['maxlevel'] > $maxlevel) {
                    $maxlevel = $seq['maxlevel'];
                }
            }

            $chardata = array_merge([], ...$runs);

            $stepl = new StepL($chardata, $pel, $maxlevel);
            foreach ($stepl->getChrData() as $chardatum) {
                $this->bidiordarr[] = $chardatum['char'];
            }

            // add back the paragraph separators
            $lastchar = end($par);
            if (($lastchar !== false) && ($lastchar >= 0) && ((UniType::UNI[$lastchar] ?? null) === 'B')) {
                $this->bidiordarr[] = $lastchar;
            }
        }
    }

    /**
     * Get the paragraph embedding level
     *
     * A forced direction takes precedence: 'R' gives level 1 and 'L' gives
     * level 0. Otherwise the level is computed by StepP.
     *
     * @param array<int, int> $par Paragraph
     */
    protected function getPel(array $par): int
    {
        if ($this->forcedir === 'R') {
            return 1;
        }

        if ($this->forcedir === 'L') {
            return 0;
        }

        $stepp = new StepP($par);
        return $stepp->getPel();
    }

    /**
     * Check if the input string contains RTL characters to process
     *
     * Also records in $arabic whether the input matches the arabic pattern.
     */
    protected function isRtlMode(): bool
    {
        $this->arabic = (preg_match(UniPattern::ARABIC, $this->str) === 1);

        return ($this->forcedir === 'R')
            || $this->arabic
            || (preg_match(UniPattern::RTL, $this->str) === 1);
    }
}