File Source for Stemmer.php

<?php
/**
* ----------------------------------------------------------------------
*
* Copyright (c) 2006-2016 Khaled Al-Sham'aa.
*
* http://www.ar-php.org
*
* PHP Version 5
*
* ----------------------------------------------------------------------
*
* LICENSE
*
* This program is open source product; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License (LGPL)
* as published by the Free Software Foundation; either version 3
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/lgpl.txt>.
*
* ----------------------------------------------------------------------
*
* Class Name: Arabic Text ArStemmer Class
*
* Filename: Stemmer.php
*
* Original Author(s): Khaled Al-Sham'aa <khaled@ar-php.org>
*
* Purpose: Get stem of an Arabic word
*
* ----------------------------------------------------------------------
*
* Source: http://arabtechies.net/node/83
* By: Taha Zerrouki <taha.zerrouki@gmail.com>
*
* ----------------------------------------------------------------------
*
* Arabic Word Stemmer Class
*
* PHP class to get stem of an Arabic word
*
* A stemmer is an automatic process in which morphological variants of terms
* are mapped to a single representative string called a stem. Arabic belongs
* to the Semitic family of languages which also includes Hebrew and Aramaic.
* Since morphological change in Arabic results from the addition of prefixes
* and infixes as well as suffixes, simple removal of suffixes is not as
* effective for Arabic as it is for English.
*
* Arabic has much richer morphology than English. Arabic has two genders,
* feminine and masculine; three numbers, singular, dual, and plural; and three
* grammatical cases, nominative, genitive, and accusative. A noun has the
* nominative case when it is a subject; accusative when it is the object of a
* verb; and genitive when it is the object of a preposition. The form of an
* Arabic noun is determined by its gender, number, and grammatical case. The
* definite nouns are formed by attaching the Arabic article "AL" directly to
* the front of the nouns. Besides prefixes, a noun can also carry a
* suffix which is often a possessive pronoun. In Arabic, the conjunction word
* "WA" (and) is often attached to the following word.
*
* Like nouns, an Arabic adjective can also have many variants. When an
* adjective modifies a noun in a noun phrase, the adjective agrees with the
* noun in gender, number, case, and definiteness. Arabic verbs have two tenses:
* perfect and imperfect. Perfect tense denotes actions completed, while
* imperfect denotes uncompleted actions. The imperfect tense has four moods:
* indicative, subjunctive, jussive, and imperative. Arabic verbs in perfect
* tense consist of a stem and a subject marker. The subject marker indicates
* the person, gender, and number of the subject. The form of a verb in perfect
* tense can have subject marker and pronoun suffix. The form of a
* subject-marker is determined together by the person, gender, and number of
* the subject.
* Example:
* <code>
* include('./I18N/Arabic.php');
* $obj = new I18N_Arabic('Stemmer');
*
* echo $obj->stem($word);
* </code>
*
* @category I18N
* @package I18N_Arabic
* @author Khaled Al-Sham'aa <khaled@ar-php.org>
* @copyright 2006-2016 Khaled Al-Sham'aa
*
* @license LGPL <http://www.gnu.org/licenses/lgpl.txt>
* @link http://www.ar-php.org
*/
/**
* This PHP class gets the stem of an Arabic word
*
* @category I18N
* @package I18N_Arabic
* @author Khaled Al-Sham'aa <khaled@ar-php.org>
* @copyright 2006-2016 Khaled Al-Sham'aa
*
* @license LGPL <http://www.gnu.org/licenses/lgpl.txt>
* @link http://www.ar-php.org
*/
class I18N_Arabic_Stemmer
{
    // Letters that may occur in a verb prefix / postfix.
    private static $_verbPre = 'وأسفلي';
    private static $_verbPost = 'ومكانيه';
    // Concatenation of the two sets above; filled in by _loadMayChars().
    private static $_verbMay;
    // Upper bounds on the prefix / postfix length stripped from a verb.
    private static $_verbMaxPre = 4;
    private static $_verbMaxPost = 6;
    // Passed to roughStem() as $minStem for the verb pass.
    private static $_verbMinStem = 2;

    // Same settings for the noun pass.
    private static $_nounPre = 'ابفكلوأ';
    private static $_nounPost = 'اتةكمنهوي';
    private static $_nounMay;
    private static $_nounMaxPre = 4;
    private static $_nounMaxPost = 6;
    private static $_nounMinStem = 2;

    /**
     * Loads initial values
     *
     * @ignore
     */
    public function __construct()
    {
        self::_loadMayChars();
    }

    /**
     * Build the combined "may be an affix letter" sets on first use.
     *
     * stem() is static, so it cannot rely on the constructor having run;
     * both entry points call this idempotent initialiser instead.
     *
     * @return void
     */
    private static function _loadMayChars()
    {
        if (self::$_verbMay === null) {
            self::$_verbMay = self::$_verbPre . self::$_verbPost;
        }
        if (self::$_nounMay === null) {
            self::$_nounMay = self::$_nounPre . self::$_nounPost;
        }
    }

    /**
     * Get rough stem of the given Arabic word
     *
     * The word is stemmed twice, once with the noun affix rules and once
     * with the verb affix rules, and the shorter result is returned (the
     * verb result on a tie). A pass that fails returns null, which counts
     * as length 0 here and therefore wins the comparison.
     *
     * @param string $word Arabic word you would like to get its stem
     *
     * @return string|null Arabic stem of the word, or null when the selected
     *                     pass could not produce a stem
     * @author Khaled Al-Sham'aa <khaled@ar-php.org>
     */
    public static function stem($word)
    {
        // Make direct static calls work without a prior "new".
        self::_loadMayChars();

        $nounStem = self::roughStem(
            $word, self::$_nounMay, self::$_nounPre, self::$_nounPost,
            self::$_nounMaxPre, self::$_nounMaxPost, self::$_nounMinStem
        );
        $verbStem = self::roughStem(
            $word, self::$_verbMay, self::$_verbPre, self::$_verbPost,
            self::$_verbMaxPre, self::$_verbMaxPost, self::$_verbMinStem
        );

        // The casts keep a null stem at length 0 without passing null to
        // mb_strlen(), which is deprecated as of PHP 8.1.
        $nounLen = mb_strlen((string) $nounStem, 'UTF-8');
        $verbLen = mb_strlen((string) $verbStem, 'UTF-8');

        if ($nounLen < $verbLen) {
            $stem = $nounStem;
        } else {
            $stem = $verbStem;
        }
        return $stem;
    }

    /**
     * Get rough stem of the given Arabic word (under specific rules)
     *
     * @param string  $word      Arabic word you would like to get its stem
     * @param string  $notChars  Arabic chars that may belong to a prefix or
     *                           postfix; any char of $word NOT found in this
     *                           set is treated as a certain stem char
     * @param string  $preChars  Arabic chars those may exists in the prefix
     * @param string  $postChars Arabic chars those may exists in the postfix
     * @param integer $maxPre    Max prefix length
     * @param integer $maxPost   Max postfix length
     * @param integer $minStem   Minimum value of (last stem index - first
     *                           stem index), i.e. stem length minus one
     *
     * @return string|null Arabic stem of the word under giving rules, or null
     *                     when the remaining span is shorter than allowed
     * @author Khaled Al-Sham'aa <khaled@ar-php.org>
     */
    protected static function roughStem (
        $word, $notChars, $preChars, $postChars, $maxPre, $maxPost, $minStem
    ) {
        // $right = index of the first stem char, $left = index of the last.
        $right = -1;
        $left = -1;
        $max = mb_strlen($word, 'UTF-8');

        // Locate the first and last chars that cannot be affix letters.
        for ($i=0; $i < $max; $i++) {
            $needle = mb_substr($word, $i, 1, 'UTF-8');
            if (mb_strpos($notChars, $needle, 0, 'UTF-8') === false) {
                if ($right == -1) {
                    $right = $i;
                }
                $left = $i;
            }
        }

        // No certain stem char was found: the prefix region is empty, so the
        // stem starts at the first char. Leaving -1 here would reach
        // mb_substr() as a negative offset, which counts from the END of the
        // string and returned the last character as the "stem".
        if ($right < 0) {
            $right = 0;
        }

        // Never strip more than the allowed prefix / postfix length.
        if ($right > $maxPre) {
            $right = $maxPre;
        }
        if ($max - $left - 1 > $maxPost) {
            $left = $max - $maxPost -1;
        }

        // The prefix ends at the first char that is not a prefix letter.
        for ($i=0; $i < $right; $i++) {
            $needle = mb_substr($word, $i, 1, 'UTF-8');
            if (mb_strpos($preChars, $needle, 0, 'UTF-8') === false) {
                $right = $i;
                break;
            }
        }

        // Scanning backwards, the postfix ends at the first char that is
        // not a postfix letter.
        for ($i=$max-1; $i>$left; $i--) {
            $needle = mb_substr($word, $i, 1, 'UTF-8');
            if (mb_strpos($postChars, $needle, 0, 'UTF-8') === false) {
                $left = $i;
                break;
            }
        }

        if ($left - $right >= $minStem) {
            $stem = mb_substr($word, $right, $left-$right+1, 'UTF-8');
        } else {
            $stem = null;
        }
        return $stem;
    }
}

Source for file Stemmer.php