├── README.md ├── SpellCorrector.php └── big.txt /README.md: -------------------------------------------------------------------------------- 1 | # اصلاح غلط املایی و نگارشی فارسی 2 | 3 | این کد بر اساس کد فلیپ ریبرو توسعه داده شده است تا به کمک الگوریتم پیشنهادی پیتر نورویگ اشکالات نگارشی و غلط املایی متون و کلمات 4 | فارسی را اصلاح کند. 5 | 6 | برای استفاده از این کد به دیتابیسی از کلمات فارسی احتیاج دارید که باید در فایل متنی داخل پوشه پروژه قرار بگیرد، ما از دیتابیس زیر 7 | استفاده کرده ایم اما شما می توانید هر متن فارسی را با هر شکلی در فایل مذکور قرار دهید 8 | 9 | (https://github.com/shahind/Persian-Words-Database) 10 | 11 | دیتابیس فوق از کلمات فرهنگ معین، ویکی پدیای فارسی، اشعار شاعران فارسی و سایر کلمات تشکیل شده است و مجموعا شامل نزدیک به 750 هزار کلمه می شود 12 | 13 | 14 | # Persian-Spell-Corrector-PHP 15 | This PHP script is based on the work of Felipe Ribeiro (http://www.feliperibeiro.com), who implemented an English spell checker in PHP using Peter Norvig's algorithm. 16 | 17 | I adapted it to work with Persian words. 
/**
 * Persian spell corrector based on Peter Norvig's spelling-correction algorithm.
 *
 * Adapted for Persian words from Felipe Ribeiro's English PHP implementation
 * (http://www.feliperibeiro.com).
 *
 * Usage: put your Persian words/text into big.txt (for example the word list from
 * https://github.com/shahind/Persian-Words-Database), include this class in your
 * project and call SpellCorrector::correct($string) with a single word.
 *
 * All text is handled as UTF-8.
 *
 * @date September 18th, 2008
 * @package catalog
 *
 */
class SpellCorrector {
    /** Corpus the dictionary is trained from; resolved against the current working directory. */
    const CORPUS_FILE = 'big.txt';

    /** Serialized dictionary cache; resolved against the current working directory. */
    const CACHE_FILE = 'serialized_dictionary.txt';

    /** Encoding passed explicitly to every mb_* call so results do not depend on ini settings. */
    const ENCODING = 'UTF-8';

    /** @var array|null word => number of occurrences in the corpus (lazy-loaded) */
    private static $NWORDS;

    /**
     * Reads a text and extracts the list of Persian words.
     *
     * A word is a maximal run of the Persian letters listed in the pattern. The "u"
     * modifier makes PCRE treat the pattern and subject as UTF-8, so the character
     * class matches whole letters rather than individual bytes. Whitespace is NOT part
     * of the class, so it separates words. No lowercasing is needed here because the
     * matched letters have no case.
     *
     * @param string $text
     * @return array The list of words
     */
    private static function words($text) {
        $pattern = '/[آابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی]+/u';
        $text = (string) $text;
        $matches = array();

        if (preg_match_all($pattern, $text, $matches) === false) {
            // With /u a single invalid byte sequence fails the whole match; scrub the
            // invalid sequences and retry instead of silently losing the entire corpus.
            $text = mb_convert_encoding($text, self::ENCODING, self::ENCODING);
            $matches = array();
            if (preg_match_all($pattern, $text, $matches) === false) {
                return array();
            }
        }

        return isset($matches[0]) ? $matches[0] : array();
    }

    /**
     * Creates a table (dictionary) where the word is the key and the value is its
     * relevance in the text (the number of times it appears).
     *
     * @param array $features list of words
     * @return array word => occurrence count
     */
    private static function train(array $features) {
        // array_count_values does the counting without touching undefined keys.
        return array_count_values($features);
    }

    /**
     * Generates the list of strings at edit distance one from the passed word:
     * deletions, substitutions, transpositions of adjacent characters and insertions.
     *
     * Every operation works on characters (not bytes), so multibyte letters are never
     * split. The order of the generated edits is: for each position the deletion
     * followed by its substitutions, then all transpositions, then all insertions.
     *
     * @param string $word
     * @return array
     */
    private static function edits1($word) {
        $alphabet = ['ٍ','َ','ُ','ِ','ء','ئ','ی','ه','و','ن','م','ل','گ','ک','ق','ف','غ','ع','ظ','ط','ض','ص','ش','س','ژ','ز','ر','ذ','د','خ','ح','چ','ج','ث','ت','پ','ب','آ','ا','ً','ٌ'];
        $enc = self::ENCODING;
        $n = mb_strlen($word, $enc);
        $edits = array();

        for ($i = 0; $i < $n; $i++) {
            $head = mb_substr($word, 0, $i, $enc);
            $tail = mb_substr($word, $i + 1, $n, $enc);
            $edits[] = $head . $tail; // deleting one char
            foreach ($alphabet as $c) {
                $edits[] = $head . $c . $tail; // substituting one char
            }
        }

        for ($i = 0; $i < $n - 1; $i++) {
            // swapping two adjacent chars (character-wise, never byte-wise)
            $edits[] = mb_substr($word, 0, $i, $enc)
                . mb_substr($word, $i + 1, 1, $enc)
                . mb_substr($word, $i, 1, $enc)
                . mb_substr($word, $i + 2, $n, $enc);
        }

        for ($i = 0; $i <= $n; $i++) {
            $head = mb_substr($word, 0, $i, $enc);
            $tail = mb_substr($word, $i, $n, $enc);
            foreach ($alphabet as $c) {
                $edits[] = $head . $c . $tail; // inserting one char
            }
        }

        return $edits;
    }

    /**
     * Generates the strings at edit distance two from the word that exist in the
     * dictionary.
     *
     * @param string $word
     * @return array
     */
    private static function known_edits2($word) {
        $known = array();
        // Duplicate first-level edits would only reproduce results already collected,
        // so they are dropped before the (expensive) second expansion.
        foreach (array_unique(self::edits1($word)) as $e1) {
            foreach (self::edits1($e1) as $e2) {
                if (isset(self::$NWORDS[$e2])) {
                    $known[] = $e2;
                }
            }
        }
        return $known;
    }

    /**
     * Given a list of words, returns the subset that is present in the dictionary.
     *
     * @param array $words
     * @return array
     */
    private static function known(array $words) {
        $known = array();
        foreach ($words as $w) {
            // Counts are never null, so isset() is equivalent to array_key_exists()
            // here and is also safe while the dictionary is not loaded.
            if (isset(self::$NWORDS[$w])) {
                $known[] = $w;
            }
        }
        return $known;
    }

    /**
     * Loads the dictionary into self::$NWORDS if it is not loaded yet.
     *
     * The serialized cache is used when it holds a non-empty array; otherwise the
     * dictionary is trained from the corpus file and the cache is (re)written. An
     * empty dictionary is never cached, so a missing corpus cannot poison the cache.
     */
    private static function loadDictionary() {
        if (!empty(self::$NWORDS)) {
            return;
        }

        if (is_readable(self::CACHE_FILE)) {
            // NOTE(review): unserialize() must only ever be fed this locally generated
            // cache file; never point CACHE_FILE at untrusted data.
            $cached = unserialize((string) file_get_contents(self::CACHE_FILE));
            if (is_array($cached) && count($cached) > 0) {
                self::$NWORDS = $cached;
                return;
            }
        }

        $corpus = is_readable(self::CORPUS_FILE) ? file_get_contents(self::CORPUS_FILE) : false;
        if ($corpus === false) {
            trigger_error('SpellCorrector: cannot read corpus file "' . self::CORPUS_FILE . '"', E_USER_WARNING);
            self::$NWORDS = array();
            return;
        }

        self::$NWORDS = self::train(self::words($corpus));

        if (count(self::$NWORDS) > 0) {
            /* To optimize performance, the serialized dictionary is saved to a file
               instead of parsing the corpus on every single execution. Writing the
               cache is best effort: a failure only costs a rebuild next time. */
            file_put_contents(self::CACHE_FILE, serialize(self::$NWORDS), LOCK_EX);
        }
    }

    /**
     * Returns the dictionary word that is the most similar (and the most relevant) to
     * the word passed as parameter.
     *
     * Known words are returned unchanged; otherwise the most frequent dictionary word
     * at edit distance one is chosen, falling back to edit distance two. When nothing
     * matches (or no dictionary is available) the trimmed, lowercased input is returned.
     *
     * @param string $word
     * @return string|null the corrected word; numeric input is returned as is (trimmed);
     *                     null when the input is empty or only whitespace
     */
    public static function correct($word) {
        $word = trim((string) $word);
        // Compare against '' explicitly: empty() would also swallow the string "0".
        if ($word === '') return null;
        if (is_numeric($word)) return $word;

        $word = mb_strtolower($word, self::ENCODING);

        self::loadDictionary();
        if (empty(self::$NWORDS)) {
            // Nothing to compare against; skip the costly edit generation.
            return $word;
        }

        if (isset(self::$NWORDS[$word])) {
            return $word;
        }

        $candidates = self::known(self::edits1($word));
        if (count($candidates) === 0) {
            $candidates = self::known_edits2($word);
        }

        // Pick the most frequent candidate; on ties the first generated one wins.
        $best = $word;
        $max = 0;
        foreach ($candidates as $c) {
            $value = self::$NWORDS[$c];
            if ($value > $max) {
                $max = $value;
                $best = $c;
            }
        }
        return $best;
    }
}