Files
EpiWebview/vendor/tecnickcom/tc-lib-unicode/src/Bidi.php

363 lines
9.5 KiB
PHP

<?php
/**
* Bidi.php
*
* @since 2011-05-23
* @category Library
* @package Unicode
* @author Nicola Asuni <info@tecnick.com>
* @copyright 2011-2024 Nicola Asuni - Tecnick.com LTD
* @license http://www.gnu.org/copyleft/lesser.html GNU-LGPL v3 (see LICENSE.TXT)
* @link https://github.com/tecnickcom/tc-lib-unicode
*
* This file is part of tc-lib-unicode software library.
*/
namespace Com\Tecnick\Unicode;
use Com\Tecnick\Unicode\Bidi\Shaping;
use Com\Tecnick\Unicode\Bidi\StepI;
use Com\Tecnick\Unicode\Bidi\StepL;
use Com\Tecnick\Unicode\Bidi\StepN;
use Com\Tecnick\Unicode\Bidi\StepP;
use Com\Tecnick\Unicode\Bidi\StepW;
use Com\Tecnick\Unicode\Bidi\StepX;
use Com\Tecnick\Unicode\Bidi\StepXten;
use Com\Tecnick\Unicode\Data\Pattern as UniPattern;
use Com\Tecnick\Unicode\Data\Type as UniType;
use Com\Tecnick\Unicode\Exception as UnicodeException;
/**
* Com\Tecnick\Unicode\Bidi
*
* @since 2015-07-13
* @category Library
* @package Unicode
* @author Nicola Asuni <info@tecnick.com>
* @copyright 2011-2024 Nicola Asuni - Tecnick.com LTD
* @license http://www.gnu.org/copyleft/lesser.html GNU-LGPL v3 (see LICENSE.TXT)
* @link https://github.com/tecnickcom/tc-lib-unicode
*/
class Bidi
{
    /**
     * String to process
     */
    protected string $str = '';

    /**
     * Array of UTF-8 chars
     *
     * @var array<string>
     */
    protected array $chrarr = [];

    /**
     * Array of UTF-8 codepoints
     *
     * @var array<int>
     */
    protected array $ordarr = [];

    /**
     * Processed string (built lazily by getString() when the text was reordered)
     */
    protected string $bidistr = '';

    /**
     * Array of processed UTF-8 chars (built lazily by getChrArray() when the text was reordered)
     *
     * @var array<string>
     */
    protected array $bidichrarr = [];

    /**
     * Array of processed UTF-8 codepoints
     *
     * @var array<int>
     */
    protected array $bidiordarr = [];

    /**
     * If 'R' forces RTL, if 'L' forces LTR
     */
    protected string $forcedir = '';

    /**
     * If true enable shaping
     */
    protected bool $shaping = true;

    /**
     * True if the string contains arabic characters
     */
    protected bool $arabic = false;

    /**
     * Array of character data
     *
     * @var array<int, array{
     *        'char': int,
     *        'i': int,
     *        'level': int,
     *        'otype': string,
     *        'pdimatch': int,
     *        'pos': int,
     *        'type': string,
     *        'x': int,
     *      }>
     */
    protected array $chardata = [];

    /**
     * Convert object
     */
    protected Convert $conv;

    /**
     * Reverse the RTL substrings using the Bidirectional Algorithm
     * http://unicode.org/reports/tr9/
     *
     * When the input needs no RTL processing (see isRtlMode()), the processed
     * string/arrays are plain copies of the input.
     *
     * @param ?string        $str      String to convert (if null it will be generated from $chrarr or $ordarr)
     * @param ?array<string> $chrarr   Array of UTF-8 chars (if empty it will be generated from $str or $ordarr)
     * @param ?array<int>    $ordarr   Array of UTF-8 codepoints (if empty it will be generated from $str or $chrarr)
     * @param string         $forcedir If 'R' forces RTL, if 'L' forces LTR
     * @param bool           $shaping  If true enable the shaping algorithm
     *
     * @throws UnicodeException when no input is provided at all
     */
    public function __construct(
        ?string $str = null,
        ?array $chrarr = null,
        ?array $ordarr = null,
        string $forcedir = '',
        bool $shaping = true
    ) {
        if (($str === null) && ($chrarr === null || $chrarr === []) && ($ordarr === null || $ordarr === [])) {
            throw new UnicodeException('empty input');
        }

        $this->conv = new Convert();
        $this->setInput($str, $chrarr, $ordarr, $forcedir);

        if (! $this->isRtlMode()) {
            // Nothing to reorder: expose the input unchanged.
            $this->bidistr = $this->str;
            $this->bidichrarr = $this->chrarr;
            $this->bidiordarr = $this->ordarr;
            return;
        }

        // Shaping is only performed when arabic characters were detected by isRtlMode().
        $this->shaping = ($shaping && $this->arabic);
        $this->process();
    }

    /**
     * Set Input data
     *
     * Whichever of the three representations is missing is derived from the others.
     *
     * @param ?string        $str      String to convert (if null it will be generated from $chrarr or $ordarr)
     * @param ?array<string> $chrarr   Array of UTF-8 chars (if empty it will be generated from $str or $ordarr)
     * @param ?array<int>    $ordarr   Array of UTF-8 codepoints (if empty it will be generated from $str or $chrarr)
     * @param string         $forcedir If 'R' forces RTL, if 'L' forces LTR (only the first character is used)
     *
     * @SuppressWarnings("PHPMD.CyclomaticComplexity")
     */
    protected function setInput(
        ?string $str = null,
        ?array $chrarr = null,
        ?array $ordarr = null,
        string $forcedir = ''
    ): void {
        if ($str === null) {
            if (($chrarr === null || $chrarr === []) && ($ordarr !== null && $ordarr !== [])) {
                $chrarr = $this->conv->ordArrToChrArr($ordarr);
            }

            // $chrarr is still null when no array was usable; implode() rejects null,
            // so treat it like the empty array (same result as passing []).
            $str = implode('', $chrarr ?? []);
        }

        if ($chrarr === null || $chrarr === []) {
            $chrarr = $this->conv->strToChrArr($str);
        }

        if ($ordarr === null || $ordarr === []) {
            $ordarr = $this->conv->chrArrToOrdArr($chrarr);
        }

        $this->str = $str;
        $this->chrarr = $chrarr;
        $this->ordarr = $ordarr;
        $this->forcedir = ($forcedir === '') ? '' : strtoupper($forcedir[0]);
    }

    /**
     * Returns the processed array of UTF-8 codepoints
     *
     * @return array<int>
     */
    public function getOrdArray(): array
    {
        return $this->bidiordarr;
    }

    /**
     * Returns the processed array of UTF-8 chars
     *
     * The array is generated from the processed codepoints on first use.
     *
     * @return array<string>
     */
    public function getChrArray(): array
    {
        if ($this->bidichrarr === []) {
            $this->bidichrarr = $this->conv->ordArrToChrArr($this->bidiordarr);
        }

        return $this->bidichrarr;
    }

    /**
     * Returns the number of characters in the processed string
     */
    public function getNumChars(): int
    {
        return count($this->getChrArray());
    }

    /**
     * Returns the processed string
     *
     * The string is generated from the processed chars on first use.
     */
    public function getString(): string
    {
        if ($this->bidistr === '') {
            $this->bidistr = implode('', $this->getChrArray());
        }

        return $this->bidistr;
    }

    /**
     * Returns an array with processed chars as keys
     *
     * @return array<int, true>
     */
    public function getCharKeys(): array
    {
        return array_fill_keys(array_values($this->bidiordarr), true);
    }

    /**
     * P1. Split the text into separate paragraphs.
     * A paragraph separator is kept with the previous paragraph.
     *
     * Note: a new (possibly empty) paragraph is always opened after a separator,
     * so the last entry is empty when the text ends with a separator.
     *
     * @return array<int, array<int>>
     */
    protected function getParagraphs(): array
    {
        $paragraph = [
            0 => [],
        ];
        $pdx = 0; // paragraphs index
        foreach ($this->ordarr as $ord) {
            $paragraph[$pdx][] = $ord;
            if (isset(UniType::UNI[$ord]) && (UniType::UNI[$ord] === 'B')) {
                ++$pdx;
                $paragraph[$pdx] = [];
            }
        }

        return $paragraph;
    }

    /**
     * Process the string
     *
     * Runs every paragraph through the X, X10, W, N, I (plus optional shaping)
     * and L steps, appending the resulting codepoints to $bidiordarr.
     *
     * @SuppressWarnings("PHPMD.CyclomaticComplexity")
     */
    protected function process(): void
    {
        // Within each paragraph, apply all the other rules of this algorithm.
        foreach ($this->getParagraphs() as $par) {
            if ($par === []) {
                // An empty paragraph produces no output: skip the whole pipeline.
                continue;
            }

            $pel = $this->getPel($par);
            $stepx = new StepX($par, $pel);
            $stepx10 = new StepXten($stepx->getChrData(), $pel);
            $ilrs = $stepx10->getIsolatedLevelRunSequences();

            $items = [];
            $maxlevel = 0;
            foreach ($ilrs as $ilr) {
                $stepw = new StepW($ilr);
                $stepn = new StepN($stepw->getSequence());
                $stepi = new StepI($stepn->getSequence());
                $ilr = $stepi->getSequence();
                if ($this->shaping) {
                    $shaping = new Shaping($ilr);
                    $ilr = $shaping->getSequence();
                }

                // Collect the runs and merge them once after the loop
                // instead of re-copying the accumulated array on every iteration.
                $items[] = $ilr['item'];
                if ($ilr['maxlevel'] > $maxlevel) {
                    $maxlevel = $ilr['maxlevel'];
                }
            }

            $chardata = ($items === []) ? [] : array_merge(...$items);

            $stepl = new StepL($chardata, $pel, $maxlevel);
            foreach ($stepl->getChrData() as $chardatum) {
                $this->bidiordarr[] = $chardatum['char'];
            }

            // add back the paragraph separators
            $lastchar = end($par);
            if (
                ($lastchar !== false)
                && ($lastchar >= 0)
                && isset(UniType::UNI[$lastchar])
                && (UniType::UNI[$lastchar] === 'B')
            ) {
                $this->bidiordarr[] = $lastchar;
            }
        }
    }

    /**
     * Get the paragraph embedding level
     *
     * A forced direction takes precedence ('R' => 1, 'L' => 0);
     * otherwise the level is computed by StepP from the paragraph content.
     *
     * @param array<int> $par Paragraph
     */
    protected function getPel(array $par): int
    {
        if ($this->forcedir === 'R') {
            return 1;
        }

        if ($this->forcedir === 'L') {
            return 0;
        }

        $stepp = new StepP($par);
        return $stepp->getPel();
    }

    /**
     * Check if the input string contains RTL characters to process
     *
     * Side effect: updates $arabic with the result of the arabic pattern match.
     */
    protected function isRtlMode(): bool
    {
        $this->arabic = (preg_match(UniPattern::ARABIC, $this->str) === 1);

        return ($this->forcedir === 'R')
            || $this->arabic
            || (preg_match(UniPattern::RTL, $this->str) === 1);
    }
}