├── .gitignore ├── GPL.txt ├── README.md ├── qurancorpus ├── __init__.py ├── constants.py ├── corpus.py ├── data │ ├── __init__.py │ └── quranic-corpus-morpology.xml └── setup.py └── setup.sh /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | **/build/ 13 | develop-eggs/ 14 | **/dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | #Ipython Notebook 62 | .ipynb_checkpoints 63 | -------------------------------------------------------------------------------- /GPL.txt: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. 
By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. 
If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. 
You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 
113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. 
You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 
165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. 
If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. 
If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 
292 | 293 | 294 | Copyright (C) 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 
# Package entry point: re-export the corpus reader and the morphology parser.
from corpus import API, MorphologyParser

# ``__all__`` has to contain the exported *names* as strings.  Listing the
# objects themselves makes ``from qurancorpus import *`` fail with a TypeError.
__all__ = ["API", "MorphologyParser"]
#: Buckwalter transliteration table: maps each ASCII transliteration symbol to
#: the single Arabic code point it stands for.
#: Every value is an explicit unicode literal: on Python 2 a plain "\uXXXX"
#: byte string keeps the backslash sequence verbatim instead of producing the
#: intended character.
BUCKWALTER2UNICODE = {
    "'": u"\u0621",  # hamza-on-the-line
    "|": u"\u0622",  # madda
    ">": u"\u0623",  # hamza-on-'alif
    "&": u"\u0624",  # hamza-on-waaw
    "<": u"\u0625",  # hamza-under-'alif
    "}": u"\u0626",  # hamza-on-yaa'
    "A": u"\u0627",  # bare 'alif
    "b": u"\u0628",  # baa'
    "p": u"\u0629",  # taa' marbuuTa
    "t": u"\u062A",  # taa'
    "v": u"\u062B",  # thaa'
    "j": u"\u062C",  # jiim
    "H": u"\u062D",  # Haa'
    "x": u"\u062E",  # khaa'
    "d": u"\u062F",  # daal
    "*": u"\u0630",  # dhaal
    "r": u"\u0631",  # raa'
    "z": u"\u0632",  # zaay
    "s": u"\u0633",  # siin
    "$": u"\u0634",  # shiin
    "S": u"\u0635",  # Saad
    "D": u"\u0636",  # Daad
    "T": u"\u0637",  # Taa'
    "Z": u"\u0638",  # Zaa' (DHaa')
    "E": u"\u0639",  # cayn
    "g": u"\u063A",  # ghayn
    "_": u"\u0640",  # taTwiil
    "f": u"\u0641",  # faa'
    "q": u"\u0642",  # qaaf
    "k": u"\u0643",  # kaaf
    "l": u"\u0644",  # laam
    "m": u"\u0645",  # miim
    "n": u"\u0646",  # nuun
    "h": u"\u0647",  # haa'
    "w": u"\u0648",  # waaw
    "Y": u"\u0649",  # 'alif maqSuura
    "y": u"\u064A",  # yaa'
    "F": u"\u064B",  # fatHatayn
    "N": u"\u064C",  # Dammatayn
    "K": u"\u064D",  # kasratayn
    "a": u"\u064E",  # fatHa
    "u": u"\u064F",  # Damma
    "i": u"\u0650",  # kasra
    "~": u"\u0651",  # shaddah
    "o": u"\u0652",  # sukuun
    "`": u"\u0670",  # dagger 'alif
    "{": u"\u0671",  # waSla
    # extended here
    "^": u"\u0653",  # Maddah
    "#": u"\u0654",  # HamzaAbove

    ":": u"\u06DC",  # SmallHighSeen
    "@": u"\u06DF",  # SmallHighRoundedZero
    "\"": u"\u06E0",  # SmallHighUprightRectangularZero
    "[": u"\u06E2",  # SmallHighMeemIsolatedForm
    ";": u"\u06E3",  # SmallLowSeen
    ",": u"\u06E5",  # SmallWaw
    ".": u"\u06E6",  # SmallYa
    "!": u"\u06E8",  # SmallHighNoon
    "-": u"\u06EA",  # EmptyCentreLowStop
    "+": u"\u06EB",  # EmptyCentreHighStop
    "%": u"\u06EC",  # RoundedHighStopWithFilledCentre
    "]": u"\u06ED",  # SmallLowMeem
}
# Groups every prefix token by its grammatical class.  The morphology parser
# inverts this table and indexes it with each prefix it finds, so every key of
# PREFIX needs a class here or that lookup raises KeyError.
PREFIXclass = {
    "determiner": ["Al+"],
    "preposition": ["bi+", "ka+", "ta+", "l:P+"],
    "future particle": ["sa+"],
    "vocative particle": ["ya+", "ha+"],
    "interrogative particle": ["A:INTG+"],
    "equalization particle": ["A:EQ+"],
    "conjunction": ["wa+", "f:CONJ+"],
    # "f:REM+" is defined in PREFIX but was not classified anywhere; REM is
    # the resumption-particle tag, so it belongs with the resumption prefixes.
    "resumption": ["w:P+", "f:REM+"],
    "cause": ["f:CAUS+"],
    "emphasis": ["l:EMPH+"],
    "purpose": ["l:PRP+"],
    "imperative": ["l:IMPV+"],
    "--undefined--": ["A+", "fa+"],
}
"l:EMPH+": (u"ل", u"lām"), 170 | "l:PRP+": (u"ل", u"lām"), 171 | "l:IMPV+": (u"ل", u"lām"), 172 | } 173 | 174 | PGNclass = { 175 | "person": ["1", "2", "3"], 176 | "number": ["S", "D", "P"], 177 | "gender": ["M", "F"] 178 | } 179 | 180 | PGN = { 181 | "1": u"متكلم", 182 | "2": u"مخاطب", 183 | "3": u"غائب", 184 | "M": u"مذّكر", 185 | "F": u"مؤنّث", 186 | "S": u"مفرد", 187 | "D": u"مثنّى", 188 | "P": u"جمع" 189 | } 190 | 191 | VERBclass = { 192 | "aspect": ["PERF", "IMPF", "IMPV"], 193 | "mood": ["IND", "SUBJ", "JUS", "ENG"], 194 | "voice": ["ACT", "PASS"], 195 | "form": ["(I)", "(II)", "(III)", "(IV)", "(V)", "(VI)", "(VII)", "(VIII)", "(IX)", "(X)", "(XI)", "(XII)"] 196 | } 197 | 198 | VERB = { 199 | "PERF": (u"فعل ماض", "Perfect verb"), 200 | "IMPF": (u"فعل مضارع", "Imperfect verb"), 201 | "IMPV": (u"فعل أمر", "Imperative verb"), 202 | "IND": (u"مرفوع", "Indicative mood"), 203 | "SUBJ": (u"منصوب", "Subjunctive mood"), 204 | "JUS": (u"مجزوم", "Jussive mood"), 205 | "ENG": (u"مؤكد", "Energetic mood"), 206 | "ACT": (u"مبني للمعلوم", "Active voice"), 207 | "PASS": (u"مبني للمجهول", "Passive voice"), 208 | "(I)": (u"", "First form"), 209 | "(II)": (u"", "Second form"), 210 | "(III)": (u"", "Third form"), 211 | "(IV)": (u"", "Fourth form"), 212 | "(V)": (u"", "Fifth form"), 213 | "(VI)": (u"", "Sixth form"), 214 | "(VII)": (u"", "Seventh form"), 215 | "(VIII)": (u"", "Eighth form"), 216 | "(IX)": (u"", "Ninth form"), 217 | "(X)": (u"", "Tenth form"), 218 | "(XI)": (u"", "Eleventh form"), 219 | "(XII)": (u"", "Twelfth form") 220 | } 221 | 222 | DERIVclass = { 223 | "derivation": ["ACT PCPL", "PASS PCPL", "VN"] 224 | } 225 | 226 | DERIV = { 227 | "ACT PCPL": (u"اسم فاعل", "Active participle"), 228 | "PASS PCPL": (u"اسم مفعول", "Passive participle"), 229 | "VN": (u"مصدر", "Verbal noun") 230 | } 231 | 232 | NOMclass = { 233 | "state": ["DEF", "INDEF"], 234 | "case": ["NOM", "ACC", "GEN"] 235 | } 236 | 237 | NOM = { 238 | "DEF": (u"معرفة", "Definite state"), 239 | "INDEF": 
def reverse_class(dictionary):
    """Invert a mapping of ``key -> value(s)`` into ``value -> [keys]``.

    Each value of *dictionary* is either a single hashable item or a list of
    hashable items.  Every item becomes a key of the result and is mapped to
    the list of original keys it was found under.

    :param dictionary: the mapping to invert; it is not modified.
    :return: a new ``dict`` of ``item -> list of keys``.
    """
    inverted = {}
    # ``items()`` exists on Python 2 and 3 alike, unlike ``iteritems()``.
    for key, value in dictionary.items():
        members = value if isinstance(value, list) else [value]
        for member in members:
            inverted.setdefault(member, []).append(key)
    return inverted


def buck2uni(string):
    """Decode a Buckwalter-transliterated string.

    Each character is looked up in ``BUCKWALTER2UNICODE``; a character with
    no entry in that table raises ``KeyError``.

    :param string: the transliterated text.
    :return: the concatenation of the mapped characters.
    """
    return "".join(BUCKWALTER2UNICODE[ch] for ch in string)


def tag_keywords(list_tags):
    """Build a pyparsing alternation matching any of the tags as a Keyword.

    :param list_tags: non-empty list of tag strings.
    :return: ``Keyword(t0) | Keyword(t1) | ...``
    :raises IndexError: if *list_tags* is empty.
    """
    # Wrap the first tag as well.  Seeding the alternation with the bare
    # string let pyparsing promote it to a Literal (which also matches a mere
    # prefix of a longer word) and returned a plain ``str`` for a one-element
    # list.
    res = Keyword(list_tags[0])
    for item in list_tags[1:]:
        res |= Keyword(item)
    return res


def tag_literals(list_tags):
    """Build a pyparsing alternation matching any of the tags as a Literal.

    :param list_tags: non-empty list of tag strings.
    :return: ``Literal(t0) | Literal(t1) | ...``
    :raises IndexError: if *list_tags* is empty.
    """
    # Wrap the first tag explicitly so the result is always a parser element,
    # including for a one-element list.
    res = Literal(list_tags[0])
    for item in list_tags[1:]:
        res |= Literal(item)
    return res
@staticmethod
def parse_step2(parsedlist):
    """Turn the token groups produced by parse_step1 into a plain dict.

    @param parsedlist: indexable result holding three groups: the prefix
        tokens, the base parts (each a flat sequence of tags) and the
        suffix tokens
    @return: dict with the keys "prefixes", "base" and "suffixes", each a
        list of dicts describing one segment of the word
    @raise IndexError: if a "KEY:" tag is the last token of its part
    @raise KeyError: if a token is missing from the constants tables
    """
    # Invert the class tables once instead of once per tag.
    prefix_types = reverse_class(PREFIXclass)
    pos_types = reverse_class(POSclass)
    pgn_types = reverse_class(PGNclass)
    deriv_types = reverse_class(DERIVclass)
    verb_types = reverse_class(VERBclass)
    nom_types = reverse_class(NOMclass)

    result = {"prefixes": [], "base": [], "suffixes": []}

    # prefixes
    for prefix in parsedlist[0]:
        result["prefixes"].append({
            "token": PREFIX[prefix][1],
            "arabictoken": PREFIX[prefix][0],
            "type": prefix_types[prefix][0],
        })

    # word base
    for part in parsedlist[1]:
        part_dict = {}
        # Explicit index: some tags consume the following token as their
        # value, and that value must not be examined again as a tag
        # (e.g. the "P" of "POS:" "P" is not the plural marker).
        i = 0
        while i < len(part):
            tag = part[i]
            i += 1
            if tag[-1] == ":":
                value = part[i]
                i += 1
                if tag == "POS:":
                    part_dict["type"] = pos_types[value][0]
                    part_dict["pos"] = POS[value][1]
                    part_dict["arabicpos"] = POS[value][0]
                elif tag == "ROOT:":
                    part_dict["root"] = value
                    part_dict["arabicroot"] = buck2uni(value)
                elif tag == "LEM:":
                    part_dict["lemma"] = value
                    part_dict["arabiclemma"] = buck2uni(value)
                elif tag == "SP:":
                    part_dict["special"] = value
                    part_dict["arabicspecial"] = buck2uni(value)
                elif tag == "MOOD:":
                    part_dict["mood"] = VERB[value][1]
                    part_dict["arabicmood"] = VERB[value][0]
                else:
                    # Parenthesised single argument: prints the same text
                    # under Python 2 and Python 3.
                    print("new tag!! " + tag)
            elif tag in PGN:
                part_dict[pgn_types[tag][0]] = PGN[tag]
            elif tag in ("ACT", "PASS") and i < len(part) and part[i] == "PCPL":
                # "ACT PCPL" / "PASS PCPL" is one derivation tag spread
                # over two tokens; consume both.
                participle = tag + " PCPL"
                part_dict[deriv_types[participle][0]] = DERIV[participle][1]
                i += 1
            elif tag in VERB:
                # Also reached by a bare "ACT"/"PASS" when VERB lists them.
                part_dict[verb_types[tag][0]] = VERB[tag][1]
            elif tag in NOM:
                kind = nom_types[tag][0]
                arabic_key = "arabicstate" if kind == "state" else "arabiccase"
                part_dict[kind] = NOM[tag][1]
                part_dict[arabic_key] = NOM[tag][0]
            elif tag == "VN":
                part_dict[deriv_types[tag][0]] = DERIV[tag][1]
        result["base"].append(part_dict)

    # suffixes
    suffixes = parsedlist[2]
    for i in range(len(suffixes)):
        if suffixes[i] == "PRON:":
            pron_dict = {}
            # Start from every pronoun and narrow down by each property.
            candidates = set(PRON["*"])
            for prop in suffixes[i + 1]:
                if prop in PGN:
                    pron_dict[pgn_types[prop][0]] = PGN[prop]
                    candidates &= PRON[prop]
            # min() instead of set.pop(): when the properties leave more
            # than one candidate the choice is at least reproducible.
            pron_dict["arabictoken"] = min(candidates) if candidates else ""
            result["suffixes"].append(pron_dict)

    return result
def all_words_generator(self):
    """Generate the properties of every word of the corpus, word by word.

    Each yielded dict holds a copy of the attributes of the <word>
    element plus "sura_id", "aya_id" and "word_id" (ints), "word" (the
    token) and "morphology" (the result of MorphologyParser.parse).

    @return: generator of one dict per <word> element
    """
    for chapter in self.corpus.findall(".//chapter"):
        sura_id = int(chapter.attrib["number"])
        for verse in chapter.findall("verse"):
            aya_id = int(verse.attrib["number"])
            for word in verse.findall("word"):
                # Copy the attributes: writing into word.attrib itself
                # would replace the raw "morphology" string stored in the
                # parsed tree and break any later pass over the corpus.
                res = dict(word.attrib)
                res["sura_id"] = sura_id
                res["aya_id"] = aya_id
                res["word_id"] = int(word.attrib["number"])
                res["word"] = word.attrib["token"]
                res["morphology"] = MorphologyParser.parse(word.attrib["morphology"])
                yield res
packages=["qurancorpus"], 14 | install_requires=['pyparsing'], 15 | 16 | author="Assem Chelli", 17 | author_email="assem.ch@gmail.com", 18 | package_dir={'qurancorpus': '.'}, 19 | long_description="""A python api for the Quranic Arabic Corpus project""", 20 | keywords="quran arabic corpus quranic", 21 | 22 | include_package_data=True, 23 | 24 | data_files=[ ('./qurancorpus/data/',['data/quranic-corpus-morpology.xml'])], 25 | 26 | zip_safe=True, 27 | 28 | classifiers=[ 29 | "Development Status :: 4 - Beta", 30 | "Intended Audience :: Developers", 31 | "License :: OSI Approved :: GNU General Public License (GPL)", 32 | "Natural Language :: Arabic", 33 | "Natural Language :: English", 34 | "Operating System :: OS Independent", 35 | "Programming Language :: Python :: 2.6", 36 | "Topic :: Software Development :: Libraries :: Python Modules", 37 | ], 38 | ) 39 | -------------------------------------------------------------------------------- /setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | cd qurancorpus 4 | python setup.py register sdist bdist_egg upload 5 | --------------------------------------------------------------------------------