Source for file Normalise.php
Documentation is available at Normalise.php
* ----------------------------------------------------------------------
* Copyright (c) 2006-2016 Khaled Al-Sham'aa.
* ----------------------------------------------------------------------
* This program is open source product; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License (LGPL)
* as published by the Free Software Foundation; either version 3
* of the License, or (at your option) any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/lgpl.txt>.
* ----------------------------------------------------------------------
* Class Name: Functions to normalise Arabic text.
* Filename: Normalise.php
* Original Author(s): Khaled Al-Sham'aa <khaled@ar-php.org>
* Purpose: Text normalisation through various stages. Also: unshaping.
* ----------------------------------------------------------------------
* This class provides various functions to manipulate arabic text and
* normalise it by applying filters, for example, to strip tatweel and
* tashkeel, to normalise hamza and lamalephs, and to unshape
* a joined Arabic text back into its normalised form.
* There is also a function to reverse a utf8 string.
* The functions are helpful for searching, indexing and similar tasks.
* Note that this class can only deal with UTF8 strings. You can use functions
* from the other classes to convert between encodings if necessary.
* include('./I18N/Arabic.php');
* $obj = new I18N_Arabic('Normalise');
* $str = "Arabic text with tatweel, tashkeel...";
* echo "<p><u><i>Before:</i></u><br />$str<br /><br />";
* $text = $obj->stripTatweel($str);
* echo "<u><i>After:</i></u><br />$text<br /><br />";
* @author Djihed Afifi <djihed@gmail.com>
* @copyright 2006-2016 Khaled Al-Sham'aa
* @license LGPL <http://www.gnu.org/licenses/lgpl.txt>
* @link http://www.ar-php.org
* This class provides various functions to manipulate arabic text and
* normalise it by applying filters, for example, to strip tatweel and
* tashkeel, to normalise hamza and lamalephs, and to unshape
* a joined Arabic text back into its normalised form.
* The functions are helpful for searching, indexing and similar tasks.
* @author Djihed Afifi <djihed@gmail.com>
* @copyright 2006-2016 Khaled Al-Sham'aa
* @license LGPL <http://www.gnu.org/licenses/lgpl.txt>
* @link http://www.ar-php.org
/** @var array Ligature map; the constructor fills it from $ligature_map. */
private $_unshapeMap = array();
/** @var array Keys of $_unshapeMap (array_keys); used as the search list in the unshaping str_replace call. */
private $_unshapeKeys = array();
/** @var array Replacement list paired with $_unshapeKeys in the unshaping str_replace call. */
private $_unshapeValues = array();
/** @var array Character table filled from $char_names; read by name, e.g. 'TATWEEL' or 'ALEF'. */
private $_chars = array();
/** @var array Character-group table filled from $char_groups; read by group name, e.g. 'TASHKEEL' or 'SUN'. */
private $_charGroups = array();
/** @var array Table filled from $char_ar_names; read by key to obtain a letter's Arabic name. */
private $_charArNames = array();
* Load the Unicode constants that will be used in substitutions
// Includes the Arabic Unicode constants file and copies its tables into this object.
public function __construct()
// The included file is expected to define $ligature_map, $char_names, $char_groups
// and $char_ar_names in this scope (the file itself is not visible here).
include dirname(__FILE__ ) . '/data/charset/ArUnicode.constants.php';
$this->_unshapeMap = $ligature_map;
// Keep the map's keys as a separate list so they can serve as a str_replace search array.
$this->_unshapeKeys = array_keys($this->_unshapeMap);
// NOTE(review): no visible line assigns $this->_unshapeValues, although it is passed to
// str_replace together with $_unshapeKeys elsewhere in this file. Confirm it is filled
// (presumably with array_values($this->_unshapeMap)); otherwise it stays an empty array.
$this->_chars = $char_names;
$this->_charGroups = $char_groups;
$this->_charArNames = $char_ar_names;
* Strip all tatweel characters from an Arabic text.
* @param string $text The text to be stripped.
* @return string the stripped text.
* @author Djihed Afifi <djihed@gmail.com>
return str_replace($this->_chars['TATWEEL'], '', $text);
* Strip all tashkeel characters from an Arabic text.
* @param string $text The text to be stripped.
* @return string the stripped text.
* @author Djihed Afifi <djihed@gmail.com>
$this->_chars['FATHATAN'],
$this->_chars['DAMMATAN'],
$this->_chars['KASRATAN'],
* Normalise all Hamza characters to their corresponding aleph
* character in an Arabic text.
* @param string $text The text to be normalised.
* @return string the normalised text.
* @author Djihed Afifi <djihed@gmail.com>
// NOTE(review): the next two lines use `=` (assignment), not `=>`. As written, each one
// overwrites an entry of $this->_chars (WAW_HAMZA with the WAW value, YEH_HAMZA with the
// YEH value) and evaluates to the assigned value. If a search => replacement map was
// intended, `=>` is needed; confirm against the enclosing array, which is not visible here.
$this->_chars['WAW_HAMZA'] = $this->_chars['WAW'],
$this->_chars['YEH_HAMZA'] = $this->_chars['YEH'],
// Alef variants; presumably the elements of $alephs used below (the array opener is not visible).
$this->_chars['ALEF_MADDA'],
$this->_chars['ALEF_HAMZA_ABOVE'],
$this->_chars['ALEF_HAMZA_BELOW'],
// NOTE(review): 'HAMZA_ABOVE,HAMZA_BELOW' is a single string key containing a comma. It looks
// like two separate keys ('HAMZA_ABOVE' and 'HAMZA_BELOW') were intended; verify against
// the keys defined in $char_names.
$this->_chars['HAMZA_ABOVE,HAMZA_BELOW']
// Replace every entry of $alephs found in $text with the plain ALEF character.
$text = str_replace($alephs, $this->_chars['ALEF'], $text);
* Unicode uses some special characters where the lamaleph and any
* hamza above them are combined into one code point. Some input
* systems use them. This function expands these characters.
* @param string $text The text to be normalised.
* @return string the normalised text.
* @author Djihed Afifi <djihed@gmail.com>
$this->_chars['LAM_ALEPH'],
$this->_chars['LAM_ALEPH_HAMZA_ABOVE'],
$simple_LAM_ALEPH_HAMZA_ABOVE,
$this->_chars['LAM_ALEPH_HAMZA_BELOW'],
$simple_LAM_ALEPH_HAMZA_BELOW,
$this->_chars['LAM_ALEPH_MADDA_ABOVE'],
$simple_LAM_ALEPH_MADDA_ABOVE,
* Return unicode char by its code point.
* @param char $u code point
* @return string the result character.
* @author Djihed Afifi <djihed@gmail.com>
* Takes a string and applies the various filters in this class
* to return a unicode normalised string suitable for activities
* such as searching, indexing, etc.
* @param string $text the text to be normalised.
* @return string the result normalised string.
* @author Djihed Afifi <djihed@gmail.com>
* Takes Arabic text in its joined form, untangles the characters and unshapes them.
* This can be used to process text that was processed through OCR
* or by extracting text from a PDF document.
* Note that the result text may need further processing. In most
* cases, you will want to use the utf8Strrev function from
* this class to reverse the string.
* Most of the work of setting up the characters for this function
* is done through the ArUnicode.constants.php constants and
* the constructor loading.
* @param string $text the text to be unshaped.
* @return string the result normalised string.
* @author Djihed Afifi <djihed@gmail.com>
return str_replace($this->_unshapeKeys, $this->_unshapeValues, $text);
* Take a UTF8 string and reverse it.
* @param string $str the string to be reversed.
* @param boolean $reverse_numbers whether to reverse numbers.
* @return string The reversed string.
public function utf8Strrev($str, $reverse_numbers = false)
foreach ($ar[0] as $value) {
foreach ($temp as $key => $value2) {
* Checks for Arabic Tashkeel marks (i.e. FATHA, DAMMA, KASRA, SUKUN,
* SHADDA, FATHATAN, DAMMATAN, KASRATAN).
* @param string $archar Arabic unicode char
* @return boolean True if it is Arabic Tashkeel mark
* @author Khaled Al-Sham'aa <khaled@ar-php.org>
if (in_array($key, $this->_charGroups['TASHKEEL'])) {
* Checks for Arabic Harakat marks (i.e. FATHA, DAMMA, KASRA, SUKUN, TANWIN).
* @param string $archar Arabic unicode char
* @return boolean True if it is Arabic Harakat mark
* @author Khaled Al-Sham'aa <khaled@ar-php.org>
if (in_array($key, $this->_charGroups['HARAKAT'])) {
* Checks for Arabic short Harakat marks (i.e. FATHA, DAMMA, KASRA, SUKUN).
* @param string $archar Arabic unicode char
* @return boolean True if it is Arabic short Harakat mark
* @author Khaled Al-Sham'aa <khaled@ar-php.org>
if (in_array($key, $this->_charGroups['SHORTHARAKAT'])) {
* Checks for Arabic Tanwin marks (i.e. FATHATAN, DAMMATAN, KASRATAN).
* @param string $archar Arabic unicode char
* @return boolean True if it is Arabic Tanwin mark
* @author Khaled Al-Sham'aa <khaled@ar-php.org>
if (in_array($key, $this->_charGroups['TANWIN'])) {
* Checks for Arabic Ligatures like LamAlef (i.e. LAM ALEF, LAM ALEF HAMZA
* ABOVE, LAM ALEF HAMZA BELOW, LAM ALEF MADDA ABOVE).
* @param string $archar Arabic unicode char
* @return boolean True if it is Arabic Ligatures like LamAlef
* @author Khaled Al-Sham'aa <khaled@ar-php.org>
if (in_array($key, $this->_charGroups['LIGUATURES'])) {
* Checks for Arabic Hamza forms (i.e. HAMZA, WAW HAMZA, YEH HAMZA, HAMZA ABOVE,
* HAMZA BELOW, ALEF HAMZA BELOW, ALEF HAMZA ABOVE).
* @param string $archar Arabic unicode char
* @return boolean True if it is Arabic Hamza form
* @author Khaled Al-Sham'aa <khaled@ar-php.org>
if (in_array($key, $this->_charGroups['HAMZAT'])) {
* Checks for Arabic Alef forms (i.e. ALEF, ALEF MADDA, ALEF HAMZA ABOVE,
* ALEF HAMZA BELOW, ALEF WASLA, ALEF MAKSURA).
* @param string $archar Arabic unicode char
* @return boolean True if it is Arabic Alef form
* @author Khaled Al-Sham'aa <khaled@ar-php.org>
public function isAlef($archar)
if (in_array($key, $this->_charGroups['ALEFAT'])) {
* Checks for Arabic Weak letters (i.e. ALEF, WAW, YEH, ALEF_MAKSURA).
* @param string $archar Arabic unicode char
* @return boolean True if it is Arabic Weak letter
* @author Khaled Al-Sham'aa <khaled@ar-php.org>
public function isWeak($archar)
if (in_array($key, $this->_charGroups['WEAK'])) {
* Checks for Arabic Yeh forms (i.e. YEH, YEH HAMZA, SMALL YEH, ALEF MAKSURA).
* @param string $archar Arabic unicode char
* @return boolean True if it is Arabic Yeh form
* @author Khaled Al-Sham'aa <khaled@ar-php.org>
if (in_array($key, $this->_charGroups['YEHLIKE'])) {
* Checks for Arabic Waw like forms (i.e. WAW, WAW HAMZA, SMALL WAW).
* @param string $archar Arabic unicode char
* @return boolean True if it is Arabic Waw like form
* @author Khaled Al-Sham'aa <khaled@ar-php.org>
if (in_array($key, $this->_charGroups['WAWLIKE'])) {
* Checks for Arabic Teh forms (i.e. TEH, TEH MARBUTA).
* @param string $archar Arabic unicode char
* @return boolean True if it is Arabic Teh form
* @author Khaled Al-Sham'aa <khaled@ar-php.org>
if (in_array($key, $this->_charGroups['TEHLIKE'])) {
* Checks for Arabic Small letters (i.e. SMALL ALEF, SMALL WAW, SMALL YEH).
* @param string $archar Arabic unicode char
* @return boolean True if it is Arabic Small letter
* @author Khaled Al-Sham'aa <khaled@ar-php.org>
if (in_array($key, $this->_charGroups['SMALL'])) {
* Checks for Arabic Moon letters.
* @param string $archar Arabic unicode char
* @return boolean True if it is Arabic Moon letter
* @author Khaled Al-Sham'aa <khaled@ar-php.org>
public function isMoon($archar)
if (in_array($key, $this->_charGroups['MOON'])) {
* Checks for Arabic Sun letters.
* @param string $archar Arabic unicode char
* @return boolean True if it is Arabic Sun letter
* @author Khaled Al-Sham'aa <khaled@ar-php.org>
public function isSun($archar)
if (in_array($key, $this->_charGroups['SUN'])) {
* Return Arabic letter name in arabic.
* @param string $archar Arabic unicode char
* @return string Arabic letter name in arabic
* @author Khaled Al-Sham'aa <khaled@ar-php.org>
$name = $this->_charArNames["$key"];
|