├── tests ├── skipif.inc ├── cld2_encoding.phpt ├── cld2_language.phpt └── cld2_detector.phpt ├── php_cld2.h ├── .travis.yml ├── .travis └── prepare.sh ├── .gitignore ├── config.m4 ├── encodings.h ├── README.md ├── LICENSE └── cld2.cc /tests/skipif.inc: -------------------------------------------------------------------------------- 1 | 5 | -------------------------------------------------------------------------------- /php_cld2.h: -------------------------------------------------------------------------------- 1 | #ifndef PHP_CLD2_H 2 | #define PHP_CLD2_H 3 | 4 | #define PHP_CLD2_EXTNAME "cld2" 5 | #define PHP_CLD2_EXTVER "0.1" 6 | 7 | #ifdef HAVE_CONFIG_H 8 | #include "config.h" 9 | #endif 10 | 11 | extern "C" { 12 | #include "php.h" 13 | } 14 | 15 | #endif /* PHP_CLD2_H */ 16 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | before_script: 4 | - ./.travis/prepare.sh 5 | 6 | language: php 7 | 8 | php: 9 | - 5.6 10 | - 5.5 11 | - 5.4 12 | - 5.3 13 | 14 | script: 15 | - phpize 16 | - ./configure --with-cld2=libcld2 17 | - make 18 | - yes "s" | make test 19 | - sudo make install 20 | -------------------------------------------------------------------------------- /.travis/prepare.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | sudo apt-get install git-core git-svn 4 | 5 | echo "Cloning CLD2" 6 | git svn clone http://cld2.googlecode.com/svn/trunk/ libcld2 7 | 8 | echo "Compiling libcld2" 9 | cd libcld2/internal 10 | ./compile_libs.sh 11 | 12 | echo "Installing libcld2" 13 | sudo cp libcld2.so /usr/local/lib 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .ycm_extra_conf.py* 2 | tags 3 | .ctags 4 | cld2-master/ 5 | libcld2/ 6 | 
libcld2.so 7 | libcld2_full.so 8 | .deps 9 | .libs/ 10 | Makefile 11 | Makefile.fragments 12 | Makefile.global 13 | Makefile.objects 14 | acinclude.m4 15 | aclocal.m4 16 | autom4te.cache/ 17 | build/ 18 | cld2.la 19 | cld2wrapper.lo 20 | config.guess 21 | config.h 22 | config.h.in 23 | config.log 24 | config.nice 25 | config.status 26 | config.sub 27 | configure 28 | configure.in 29 | install-sh 30 | libcld2.lo 31 | libtool 32 | ltmain.sh 33 | missing 34 | mkinstalldirs 35 | modules/ 36 | run-tests.php 37 | *.loT 38 | *.lo 39 | -------------------------------------------------------------------------------- /tests/cld2_encoding.phpt: -------------------------------------------------------------------------------- 1 | --TEST-- 2 | CLD2Encoding Tests 3 | --DESCRIPTION-- 4 | Tests CLD2Encoding static methods and properties 5 | --SKIPIF-- 6 | 7 | --FILE-- 8 | 16 | --EXPECT-- 17 | object(CLD2Encoding)#1 (0) { 18 | } 19 | int(22) 20 | int(63) 21 | int(74) 22 | string(4) "UTF8" 23 | string(20) "SOFTBANK_ISO_2022_JP" -------------------------------------------------------------------------------- /config.m4: -------------------------------------------------------------------------------- 1 | dnl config.m4 for extension cld2 2 | 3 | PHP_ARG_WITH(cld2, whether to enable cld2 support, 4 | [ --with-cld2[=DIR] Include cld2 support]) 5 | 6 | 7 | if test "$PHP_CLD2" != "no"; then 8 | 9 | if test "$PHP_CLD2" = "yes"; then 10 | AC_MSG_ERROR([cld2 path not provided]) 11 | fi 12 | 13 | if ! 
test -r $PHP_CLD2/internal/compact_lang_det_impl.h; then 14 | AC_MSG_ERROR([cld2 not found at provided path]) 15 | fi 16 | 17 | 18 | PHP_REQUIRE_CXX() 19 | 20 | PHP_ADD_INCLUDE($PHP_CLD2) 21 | PHP_ADD_LIBRARY(stdc++, 1, CLD2_SHARED_LIBADD) 22 | PHP_ADD_LIBRARY_WITH_PATH(cld2, $PHP_CLD2/internal, CLD2_SHARED_LIBADD) 23 | PHP_SUBST(CLD2_SHARED_LIBADD) 24 | 25 | PHP_NEW_EXTENSION(cld2, cld2.cc, $ext_shared,,,1) 26 | fi 27 | -------------------------------------------------------------------------------- /tests/cld2_language.phpt: -------------------------------------------------------------------------------- 1 | --TEST-- 2 | CLD2Language Tests 3 | --DESCRIPTION-- 4 | Tests CLD2Language static methods and properties 5 | --SKIPIF-- 6 | 7 | --FILE-- 8 | 19 | --EXPECT-- 20 | int(7) 21 | string(7) "ITALIAN" 22 | string(7) "ENGLISH" 23 | string(3) "pam" 24 | int(176) 25 | int(176) 26 | int(0) 27 | string(7) "Unknown" 28 | string(2) "un" 29 | -------------------------------------------------------------------------------- /tests/cld2_detector.phpt: -------------------------------------------------------------------------------- 1 | --TEST-- 2 | CLD2Detector Tests 3 | --DESCRIPTION-- 4 | Tests CLD2Detector Class methods and properties 5 | --SKIPIF-- 6 | 7 | --FILE-- 8 | isPlainText()); 12 | $cld2->setPlainText(true); 13 | var_dump($cld2->isPlainText()); 14 | var_dump($cld2->getTldHint()); 15 | $cld2->setTldHint("com"); 16 | var_dump($cld2->tldHint); 17 | var_dump($cld2->getLanguageHint()); 18 | $cld2->setLanguageHint(CLD2Language::ITALIAN); 19 | var_dump($cld2->languageHint); 20 | var_dump($cld2->getEncodingHint()); 21 | $cld2->setEncodingHint(CLD2Encoding::UTF8); 22 | var_dump($cld2->encodingHint); 23 | var_dump($cld2->detect('My name is Melissa')); 24 | ?> 25 | --EXPECT-- 26 | object(CLD2Detector)#1 (4) { 27 | ["isPlainText"]=> 28 | bool(false) 29 | ["tldHint"]=> 30 | string(0) "" 31 | ["languageHint"]=> 32 | int(26) 33 | ["encodingHint"]=> 34 | int(23) 35 | } 36 | 
bool(false) 37 | bool(true) 38 | string(0) "" 39 | string(3) "com" 40 | int(26) 41 | int(7) 42 | int(23) 43 | int(22) 44 | array(5) { 45 | ["language_id"]=> 46 | int(0) 47 | ["language_code"]=> 48 | string(2) "en" 49 | ["language_name"]=> 50 | string(7) "ENGLISH" 51 | ["language_probability"]=> 52 | int(95) 53 | ["is_reliable"]=> 54 | bool(true) 55 | } 56 | -------------------------------------------------------------------------------- /encodings.h: -------------------------------------------------------------------------------- 1 | #ifndef PHP_CLD2_ENCODINGS_H 2 | #define PHP_CLD2_ENCODINGS_H 3 | 4 | static const char *encodingStrings[] = { 5 | "ISO_8859_1", 6 | "ISO_8859_2", 7 | "ISO_8859_3", 8 | "ISO_8859_4", 9 | "ISO_8859_5", 10 | "ISO_8859_6", 11 | "ISO_8859_7", 12 | "ISO_8859_8", 13 | "ISO_8859_9", 14 | "ISO_8859_10", 15 | "JAPANESE_EUC_JP", 16 | "JAPANESE_SHIFT_JIS", 17 | "JAPANESE_JIS", 18 | "CHINESE_BIG5", 19 | "CHINESE_GB", 20 | "CHINESE_EUC_CN", 21 | "KOREAN_EUC_KR", 22 | "UNICODE_UNUSED", 23 | "CHINESE_EUC_DEC", 24 | "CHINESE_CNS", 25 | "CHINESE_BIG5_CP950", 26 | "JAPANESE_CP932", 27 | "UTF8", 28 | "UNKNOWN_ENCODING", 29 | "ASCII_7BIT", 30 | "RUSSIAN_KOI8_R", 31 | "RUSSIAN_CP1251", 32 | "MSFT_CP1252", 33 | "RUSSIAN_KOI8_RU", 34 | "MSFT_CP1250", 35 | "ISO_8859_15", 36 | "MSFT_CP1254", 37 | "MSFT_CP1257", 38 | "ISO_8859_11", 39 | "MSFT_CP874", 40 | "MSFT_CP1256", 41 | "MSFT_CP1255", 42 | "ISO_8859_8_I", 43 | "HEBREW_VISUAL", 44 | "CZECH_CP852", 45 | "CZECH_CSN_369103", 46 | "MSFT_CP1253", 47 | "RUSSIAN_CP866", 48 | "ISO_8859_13", 49 | "ISO_2022_KR", 50 | "GBK", 51 | "GB18030", 52 | "BIG5_HKSCS", 53 | "ISO_2022_CN", 54 | "TSCII", 55 | "TAMIL_MONO", 56 | "TAMIL_BI", 57 | "JAGRAN", 58 | "MACINTOSH_ROMAN", 59 | "UTF7", 60 | "BHASKAR", 61 | "HTCHANAKYA", 62 | "UTF16BE", 63 | "UTF16LE", 64 | "UTF32BE", 65 | "UTF32LE", 66 | "BINARYENC", 67 | "HZ_GB_2312", 68 | "UTF8UTF8", 69 | "TAM_ELANGO", 70 | "TAM_LTTMBARANI", 71 | "TAM_SHREE", 72 | "TAM_TBOOMIS", 73 | 
"TAM_TMNEWS", 74 | "TAM_WEBTAMIL", 75 | "KDDI_SHIFT_JIS", 76 | "DOCOMO_SHIFT_JIS", 77 | "SOFTBANK_SHIFT_JIS", 78 | "KDDI_ISO_2022_JP", 79 | "SOFTBANK_ISO_2022_JP", 80 | }; 81 | 82 | #endif // PHP_CLD2_ENCODINGS_H 83 | 84 | 85 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Compact Language Detector 2 PHP Extension 2 | 3 | This extension wraps [CLD2](https://github.com/CLD2Owners/cld2) (Compact Language Detector 2) that detects over 80 languages in Unicode UTF-8 text. 4 | 5 | ## Usage 6 | 7 | ### Detector 8 | 9 | ```php 10 | $cld2 = new \CLD2Detector(); 11 | 12 | $cld2->setTldHint('it'); // optional, hints about the Top level domain (it: italian, fr: french, de: german etc..) 13 | $cld2->setLanguageHint(CLD2Language::GERMAN); // optional, hints about the language. 14 | $cld2->setEncodingHint(CLD2Encoding::UTF8); // optional, hints about text encoding 15 | 16 | var_dump($cld2->detect('My name is Melissa')); 17 | 18 | // Output 19 | array(5) { 20 | 'language_id' => 21 | int(0) 22 | 'language_code' => 23 | string(2) "en" 24 | 'language_name' => 25 | string(7) "ENGLISH" 26 | 'language_probability' => 27 | int(95) 28 | 'is_reliable' => 29 | bool(true) 30 | } 31 | 32 | ``` 33 | 34 | 35 | ### Language 36 | 37 | Get language name from Language id 38 | ```php 39 | $lang = CLD2Language::PAMPANGA; 40 | echo CLD2Language::languageName($lang); 41 | 42 | // Output 43 | string(8) "PAMPANGA" 44 | ``` 45 | 46 | Get language code from Language id 47 | ```php 48 | echo CLD2Language::languageCode(CLD2Language::PAMPANGA); 49 | 50 | // Output 51 | string(3) "pam" 52 | ``` 53 | 54 | Get language id from name where Name can be either full name or ISO code, or can be ISO code embedded in a language-script combination such as "en-Latn-GB" 55 | ```php 56 | echo CLD2Language::getLanguageFromName('PAMPANGA'); 57 | // or 58 | echo 
CLD2Language::getLanguageFromName('pam'); 59 | 60 | // Output 61 | int(176) 62 | ``` 63 | 64 | ### Encoding 65 | 66 | Get encoding name from id 67 | ```php 68 | $enc = CLD2Encoding::UTF8; 69 | echo CLD2Encoding::encodingName($enc); 70 | 71 | // Output 72 | string(4) "UTF8" 73 | ``` 74 | 75 | ## Installation 76 | 77 | ## From PECL 78 | Not submitted yet. 79 | 80 | ## Build from source 81 | 82 | First of all clone current repository and enter into its directory. 83 | 84 | ```bash 85 | git clone git@github.com:fntlnz/cld2-php-ext.git 86 | cd cld2-php-ext 87 | ``` 88 | 89 | **Compile CLD2 library** 90 | 91 | ```bash 92 | wget -nv -O - https://github.com/CLD2Owners/cld2/archive/master.tar.gz | tar zx 93 | cd cld2-master/internal 94 | CFLAGS="-Wno-narrowing" ./compile_libs.sh 95 | sudo cp libcld2.so /usr/local/lib 96 | ``` 97 | 98 | **Compile CLD2 PHP extension** 99 | 100 | Come back to the `cld2-php-ext` directory and execute: 101 | 102 | ```bash 103 | phpize 104 | ./configure --with-cld2=cld2-master 105 | make -j 106 | sudo make install 107 | ``` 108 | 109 | Do not forget to add `extension=cld2.so` to your PHP ini. 110 | 111 | If you have a libtool version mismatch, this may help: 112 | 113 | ```bash 114 | rm aclocal.m4 115 | autoreconf -i 116 | ``` 117 | 118 | ## Contributing 119 | See [CONTRIBUTING.md](CONTRIBUTING.md) 120 | 121 | ## NOTES 122 | 123 | - [CLD2 library](https://github.com/CLD2Owners/cld2) 124 | 125 | [![Analytics](https://ga-beacon.appspot.com/UA-45983436-1/fntlnz/cld2-php-ext)](https://github.com/igrigorik/ga-beacon) 126 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2014 Lorenzo Fontana 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /cld2.cc: -------------------------------------------------------------------------------- 1 | #include "php_cld2.h" 2 | #include "internal/compact_lang_det_impl.h" 3 | #include "encodings.h" 4 | #include "public/encodings.h" 5 | 6 | zend_class_entry *cld2_detector_ce; 7 | zend_class_entry *cld2_language_ce; 8 | zend_class_entry *cld2_encoding_ce; 9 | zend_object_handlers cld2_object_handlers; 10 | /* Result of one detection run, filled in by detect_language(). language_code and language_name are strdup() copies made there, so they are malloc-owned strings that nothing in this struct releases. */ 11 | struct st_detected_language { 12 | CLD2::Language language; 13 | int probability; 14 | bool is_reliable; 15 | char* language_code; 16 | char* language_name; 17 | }; 18 | /* Run CLD2::ExtDetectLanguageSummary() over text_len bytes of text and pack the outcome into an st_detected_language. tld_hint, encoding_hint and language_hint are forwarded through a CLDHints initializer whose first field is left NULL (presumably the content-language hint - TODO confirm against the CLD2 headers). The hint values are cast to the CLD2 enums without validation, so callers must pass real ids. */ 19 | static st_detected_language detect_language(char *text, int text_len, bool is_plain_text, char *tld_hint, long language_hint, long encoding_hint) 20 | { 21 | int flags = 0; 22 | bool is_reliable; 23 | 24 | // Detect language 25 | CLD2::Language language3[3]; 26 | int percent3[3]; 27 | double normalized_score3[3]; 28 | int text_bytes; 29 | 30 | CLD2::CLDHints cld_hints = {NULL, tld_hint, (CLD2::Encoding) encoding_hint, (CLD2::Language) language_hint}; 31 | CLD2::Language lang =
CLD2::ExtDetectLanguageSummary( 32 | text, 33 | text_len, 34 | is_plain_text, 35 | &cld_hints, 36 | flags, 37 | language3, 38 | percent3, 39 | normalized_score3, 40 | NULL, 41 | &text_bytes, 42 | &is_reliable); 43 | 44 | st_detected_language dl; 45 | dl.language = lang; 46 | dl.probability = percent3[0]; 47 | dl.is_reliable = is_reliable; 48 | dl.language_code = strdup(CLD2::LanguageCode(lang)); 49 | dl.language_name = strdup(CLD2::LanguageName(lang)); 50 | 51 | return dl; 52 | } 53 | /* Return true when lang is a usable language id, i.e. 0 <= lang < NUM_LANGUAGES. NUM_LANGUAGES is the element count (MINIT registers ids 0 .. NUM_LANGUAGES - 1), so it is not itself a valid id. */ 54 | static bool check_language(CLD2::Language lang) 55 | { 56 | return !(lang < 0 || lang >= CLD2::NUM_LANGUAGES); 57 | } 58 | /* Return true when encoding is a usable encoding id, i.e. 0 <= encoding < NUM_ENCODINGS. The result guards indexing of encodingStrings[], so accepting NUM_ENCODINGS itself would read one element past the table. */ 59 | static bool check_encoding(long encoding) 60 | { 61 | return !(encoding < 0 || encoding >= CLD2::NUM_ENCODINGS); 62 | } 63 | 64 | /* ============================================ */ 65 | /* Detector */ 66 | /* ============================================ */ 67 | 68 | 69 | ZEND_BEGIN_ARG_INFO(arginfo_detector_detect, 0) 70 | ZEND_ARG_INFO(0, text) 71 | ZEND_END_ARG_INFO() 72 | 73 | ZEND_BEGIN_ARG_INFO(arginfo_detector_is_plain_text, 0) 74 | ZEND_END_ARG_INFO() 75 | 76 | ZEND_BEGIN_ARG_INFO(arginfo_detector_set_plain_text, 0) 77 | ZEND_ARG_INFO(0, plainText) 78 | ZEND_END_ARG_INFO() 79 | 80 | ZEND_BEGIN_ARG_INFO(arginfo_detector_get_tld_hint, 0) 81 | ZEND_END_ARG_INFO() 82 | 83 | ZEND_BEGIN_ARG_INFO(arginfo_detector_set_tld_hint, 0) 84 | ZEND_ARG_INFO(0, tldHint) 85 | ZEND_END_ARG_INFO() 86 | 87 | ZEND_BEGIN_ARG_INFO(arginfo_detector_get_language_hint, 0) 88 | ZEND_END_ARG_INFO() 89 | 90 | ZEND_BEGIN_ARG_INFO(arginfo_detector_set_language_hint, 0) 91 | ZEND_ARG_INFO(0, languageHint) 92 | ZEND_END_ARG_INFO() 93 | 94 | ZEND_BEGIN_ARG_INFO(arginfo_detector_get_encoding_hint, 0) 95 | ZEND_END_ARG_INFO() 96 | 97 | ZEND_BEGIN_ARG_INFO(arginfo_detector_set_encoding_hint, 0) 98 | ZEND_ARG_INFO(0, encodingHint) 99 | ZEND_END_ARG_INFO() 100 | 101 | 102 | /** {{{ proto array CLD2Detector::detect(string $text) 103 | perform the language detection of the given text */ 104
| PHP_METHOD(cld2_detector, detect) 105 | { 106 | zval *detector, *is_plain, *tld_hint, *language_hint, *encoding_hint; 107 | char *text; 108 | size_t text_len; /* "s" stores a size_t length; an int here is overrun on 64-bit builds */ 109 | 110 | if (zend_parse_method_parameters(ZEND_NUM_ARGS(), getThis(), "Os", &detector, cld2_detector_ce, &text, &text_len) == FAILURE) { 111 | RETURN_NULL(); 112 | } 113 | zval rv1, rv2, rv3, rv4; 114 | 115 | is_plain = zend_read_property(cld2_detector_ce, detector, "isPlainText", sizeof("isPlainText") - 1, 1, &rv1); 116 | tld_hint = zend_read_property(cld2_detector_ce, detector, "tldHint", sizeof("tldHint") - 1, 1, &rv2); 117 | language_hint = zend_read_property(cld2_detector_ce, detector, "languageHint", sizeof("languageHint") - 1, 1, &rv3); 118 | encoding_hint = zend_read_property(cld2_detector_ce, detector, "encodingHint", sizeof("encodingHint") - 1, 1, &rv4); 119 | /* detect_language() takes an int length, so refuse input that cannot be represented. */ if (text_len > (size_t) INT_MAX) { php_error_docref(NULL, E_WARNING, "Text is too long to be analysed"); RETURN_NULL(); } /* The properties are zvals: read their values instead of casting the zval pointers. The properties are public, so fall back to "unknown" for ids outside the CLD2 enums and to no TLD hint for non-string values. */ ZVAL_DEREF(tld_hint); zend_long language = zval_get_long(language_hint); if (language < 0 || language >= CLD2::NUM_LANGUAGES) { language = CLD2::UNKNOWN_LANGUAGE; } zend_long encoding = zval_get_long(encoding_hint); if (encoding < 0 || encoding >= CLD2::NUM_ENCODINGS) { encoding = CLD2::UNKNOWN_ENCODING; } 120 | st_detected_language dl = detect_language(text, (int) text_len, zend_is_true(is_plain) != 0, Z_TYPE_P(tld_hint) == IS_STRING ? Z_STRVAL_P(tld_hint) : NULL, (long) language, (long) encoding); 121 | 122 | 123 | // Prepare array 124 | array_init(return_value); 125 | add_assoc_long(return_value, "language_id", (int) (dl.language)); 126 | add_assoc_string(return_value, "language_code", dl.language_code); 127 | add_assoc_string(return_value, "language_name", dl.language_name); 128 | add_assoc_long(return_value, "language_probability", dl.probability); 129 | add_assoc_bool(return_value, "is_reliable", dl.is_reliable); /* add_assoc_string() copied the strings, so release the strdup() buffers made by detect_language(). */ free(dl.language_code); free(dl.language_name); 130 | } 131 | /* }}} */ 132 | 133 | /** {{{ proto bool CLD2Detector::isPlainText() 134 | return the isPlainText property: when true, CLD2 neither skips html tags nor expands html entities. */ 135 | PHP_METHOD(cld2_detector, isPlainText) 136 | { 137 | zval *detector, *is_plain; 138 | 139 | if (zend_parse_method_parameters(ZEND_NUM_ARGS(), getThis(), "O", &detector, cld2_detector_ce) == FAILURE) { 140 | RETURN_NULL(); 141 | } 142 | zval rv; 143 | is_plain = zend_read_property(cld2_detector_ce, detector, "isPlainText", sizeof("isPlainText") -1, 1, &rv); 144 | 145 | RETVAL_ZVAL(is_plain, 1, 0); 146 | } /* }}}
*/ 147 | 148 | /* {{{ proto CLD2Detector::setPlainText(bool $isPlainText) 149 | enabling this option tells CLD2 to not skip html tags and not expand html 150 | entities */ 151 | PHP_METHOD(cld2_detector, setPlainText) 152 | { 153 | zval *detector; 154 | zend_bool is_plain; 155 | 156 | if (zend_parse_method_parameters(ZEND_NUM_ARGS(), getThis(), "Ob", &detector, cld2_detector_ce, &is_plain) == FAILURE) { 157 | RETURN_NULL(); 158 | } 159 | 160 | zend_update_property_bool(cld2_detector_ce, detector, "isPlainText", sizeof("isPlainText") - 1, is_plain); 161 | } /* }}} */ 162 | 163 | /* {{{ proto string CLD2Detector::getTldHint() 164 | return the current value of the tldHint property */ PHP_METHOD(cld2_detector, getTldHint) 165 | { 166 | zval *detector, *tld_hint; 167 | 168 | if (zend_parse_method_parameters(ZEND_NUM_ARGS(), getThis(), "O", &detector, cld2_detector_ce) == FAILURE) { 169 | RETURN_NULL(); 170 | } 171 | zval rv; 172 | tld_hint = zend_read_property(cld2_detector_ce, detector, "tldHint", sizeof("tldHint") - 1, 1, &rv); 173 | 174 | RETVAL_ZVAL(tld_hint, 1, 0); 175 | } /* }}} */ 176 | 177 | /* {{{ proto CLD2Detector::setTldHint(string $hint) 178 | store $hint in the tldHint property */ PHP_METHOD(cld2_detector, setTldHint) 179 | { 180 | zval *detector; 181 | 182 | char *hint = NULL; 183 | size_t hint_len; /* "s" stores a size_t length; an int here is overrun on 64-bit builds */ 184 | 185 | if (zend_parse_method_parameters(ZEND_NUM_ARGS(), getThis(), "Os", &detector, cld2_detector_ce, &hint, &hint_len) == FAILURE) { 186 | RETURN_NULL(); 187 | } 188 | /* Update through the parsed object, like the other setters. */ 189 | zend_update_property_string(cld2_detector_ce, detector, "tldHint", sizeof("tldHint") - 1, hint); 190 | } /* }}} */ 191 | 192 | /* {{{ proto int CLD2Detector::getLanguageHint() */ 193 | PHP_METHOD(cld2_detector, getLanguageHint) 194 | { 195 | zval *detector, *languageHint; 196 | 197 | if (zend_parse_method_parameters(ZEND_NUM_ARGS(), getThis(), "O", &detector, cld2_detector_ce) == FAILURE) { 198 | RETURN_NULL(); 199 | } 200 | 201 | zval rv; 202 | 203 | languageHint = zend_read_property(cld2_detector_ce, detector, "languageHint", sizeof("languageHint") - 1, 1, &rv); 204
| 205 | RETVAL_ZVAL(languageHint, 1, 0); 206 | } /* }}} */ 207 | 208 | /* {{{ proto CLD2Detector::setLanguageHint(int $hint) 209 | store $hint (a CLD2Language id) in the languageHint property */ PHP_METHOD(cld2_detector, setLanguageHint) 210 | { 211 | zval *detector; 212 | zend_long hint; /* "l" stores a zend_long, which is wider than long on LLP64 platforms */ 213 | 214 | if (zend_parse_method_parameters(ZEND_NUM_ARGS(), getThis(), "Ol", &detector, cld2_detector_ce, &hint) == FAILURE) { 215 | RETURN_NULL(); 216 | } 217 | 218 | zend_update_property_long(cld2_detector_ce, detector, "languageHint", sizeof("languageHint") - 1, hint); 219 | } /* }}} */ 220 | 221 | /* {{{ proto CLD2Detector::setEncodingHint(int $hint) 222 | store $hint (a CLD2Encoding id) in the encodingHint property */ PHP_METHOD(cld2_detector, setEncodingHint) 223 | { 224 | zval *detector; 225 | zend_long hint; /* "l" stores a zend_long, which is wider than long on LLP64 platforms */ 226 | 227 | if (zend_parse_method_parameters(ZEND_NUM_ARGS(), getThis(), "Ol", &detector, cld2_detector_ce, &hint) == FAILURE) { 228 | RETURN_NULL(); 229 | } 230 | 231 | zend_update_property_long(cld2_detector_ce, detector, "encodingHint", sizeof("encodingHint") - 1, hint); 232 | } /* }}} */ 233 | 234 | /* {{{ proto int CLD2Detector::getEncodingHint() 235 | return the current value of the encodingHint property */ PHP_METHOD(cld2_detector, getEncodingHint) 236 | { 237 | zval *detector, *encodingHint, rv; 238 | 239 | if (zend_parse_method_parameters(ZEND_NUM_ARGS(), getThis(), "O", &detector, cld2_detector_ce) == FAILURE) { 240 | RETURN_NULL(); 241 | } 242 | 243 | encodingHint = zend_read_property(cld2_detector_ce, detector, "encodingHint", sizeof("encodingHint") -1, 1, &rv); 244 | 245 | RETVAL_ZVAL(encodingHint, 1, 0); 246 | } /* }}} */ 247 | 248 | zend_function_entry cld2_methods[] = { 249 | PHP_ME(cld2_detector, detect, arginfo_detector_detect, ZEND_ACC_PUBLIC) 250 | PHP_ME(cld2_detector, isPlainText, arginfo_detector_is_plain_text, ZEND_ACC_PUBLIC) 251 | PHP_ME(cld2_detector, setPlainText, arginfo_detector_set_plain_text, ZEND_ACC_PUBLIC) 252 | PHP_ME(cld2_detector, getTldHint, arginfo_detector_get_tld_hint, ZEND_ACC_PUBLIC) 253 | PHP_ME(cld2_detector, setTldHint, arginfo_detector_set_tld_hint, ZEND_ACC_PUBLIC) 254 | PHP_ME(cld2_detector,
getLanguageHint, arginfo_detector_get_language_hint, ZEND_ACC_PUBLIC) 255 | PHP_ME(cld2_detector, setLanguageHint, arginfo_detector_set_language_hint, ZEND_ACC_PUBLIC) 256 | PHP_ME(cld2_detector, getEncodingHint, arginfo_detector_get_encoding_hint, ZEND_ACC_PUBLIC) 257 | PHP_ME(cld2_detector, setEncodingHint, arginfo_detector_set_encoding_hint, ZEND_ACC_PUBLIC) 258 | {NULL, NULL, NULL} 259 | }; 260 | 261 | 262 | /* ============================================ */ 263 | /* Language */ 264 | /* ============================================ */ 265 | 266 | ZEND_BEGIN_ARG_INFO(arginfo_language_language_name, 0) 267 | ZEND_ARG_INFO(0, id) 268 | ZEND_END_ARG_INFO() 269 | 270 | ZEND_BEGIN_ARG_INFO(arginfo_language_language_code, 0) 271 | ZEND_ARG_INFO(0, id) 272 | ZEND_END_ARG_INFO() 273 | 274 | ZEND_BEGIN_ARG_INFO(arginfo_language_get_language_from_name, 0) 275 | ZEND_ARG_INFO(0, name) 276 | ZEND_END_ARG_INFO() 277 | 278 | /* {{{ proto string CLD2Language::languageName(int $language) */ 279 | PHP_METHOD(cld2_language, languageName) 280 | { 281 | long language; 282 | 283 | if (zend_parse_parameters(ZEND_NUM_ARGS(), "l", &language) == FAILURE) { 284 | RETURN_NULL(); 285 | } 286 | 287 | CLD2::Language l = (CLD2::Language) (language); 288 | 289 | if (!check_language(l)) { 290 | RETURN_STRING(CLD2::LanguageName(CLD2::UNKNOWN_LANGUAGE)); 291 | }; 292 | 293 | 294 | RETURN_STRING(CLD2::LanguageName(l)); 295 | } /* }}} */ 296 | 297 | /* {{{ proto string CLD2Language::languageCode(int $language) */ 298 | PHP_METHOD(cld2_language, languageCode) 299 | { 300 | long language; 301 | 302 | if (zend_parse_parameters(ZEND_NUM_ARGS(), "l", &language) == FAILURE) { 303 | RETURN_NULL(); 304 | } 305 | 306 | CLD2::Language l = (CLD2::Language) (language); 307 | 308 | if (!check_language(l)) { 309 | RETURN_STRING(CLD2::LanguageCode(CLD2::UNKNOWN_LANGUAGE)); 310 | }; 311 | 312 | RETURN_STRING(CLD2::LanguageCode(l)); 313 | } /* }}} */ 314 | 315 | /* {{{ proto int 
CLD2Language::getLanguageFromName(string name) */ 316 | PHP_METHOD(cld2_language, getLanguageFromName) 317 | { 318 | char *name; 319 | int name_len; 320 | 321 | if (zend_parse_parameters(ZEND_NUM_ARGS(), "s", &name, &name_len) == FAILURE) { 322 | RETURN_NULL(); 323 | } 324 | 325 | RETURN_LONG(CLD2::GetLanguageFromName(name)); 326 | } /* }}} */ 327 | 328 | zend_function_entry cld2_language_methods[] = { 329 | PHP_ME(cld2_language, languageName, arginfo_language_language_name, ZEND_ACC_PUBLIC | ZEND_ACC_STATIC) 330 | PHP_ME(cld2_language, languageCode, arginfo_language_language_code, ZEND_ACC_PUBLIC | ZEND_ACC_STATIC) 331 | PHP_ME(cld2_language, getLanguageFromName, arginfo_language_get_language_from_name, ZEND_ACC_PUBLIC | ZEND_ACC_STATIC) 332 | {NULL, NULL, NULL} 333 | }; 334 | 335 | 336 | /* ============================================ */ 337 | /* Encoding */ 338 | /* ============================================ */ 339 | 340 | ZEND_BEGIN_ARG_INFO(arginfo_encoding_encoding_name, 0) 341 | ZEND_ARG_INFO(0, id) 342 | ZEND_END_ARG_INFO() 343 | 344 | /* {{{ proto string CLD2Encoding::encodingName(int $encoding) */ 345 | PHP_METHOD(cld2_encoding, encodingName) 346 | { 347 | long encoding; 348 | 349 | if (zend_parse_parameters(ZEND_NUM_ARGS(), "l", &encoding) == FAILURE) { 350 | RETURN_NULL(); 351 | } 352 | 353 | if (!check_encoding(encoding)) { 354 | RETURN_STRING(encodingStrings[CLD2::UNKNOWN_ENCODING]); 355 | } 356 | 357 | RETURN_STRING(encodingStrings[encoding]); 358 | } /* }}} */ 359 | 360 | zend_function_entry cld2_encoding_methods[] = { 361 | PHP_ME(cld2_encoding, encodingName, arginfo_encoding_encoding_name, ZEND_ACC_PUBLIC | ZEND_ACC_STATIC) 362 | {NULL, NULL, NULL} 363 | }; 364 | 365 | PHP_MINIT_FUNCTION(cld2) 366 | { 367 | // Detector 368 | zend_class_entry ce; 369 | INIT_CLASS_ENTRY(ce, "CLD2Detector", cld2_methods); 370 | cld2_detector_ce = zend_register_internal_class(&ce); 371 | zend_declare_property_bool(cld2_detector_ce, "isPlainText", 
sizeof("isPlainText") - 1 , 0, ZEND_ACC_PUBLIC); 372 | zend_declare_property_string(cld2_detector_ce, "tldHint", sizeof("tldHint") - 1, "", ZEND_ACC_PUBLIC); 373 | zend_declare_property_long(cld2_detector_ce, "languageHint", sizeof("languageHint") - 1, CLD2::UNKNOWN_LANGUAGE, ZEND_ACC_PUBLIC); 374 | zend_declare_property_long(cld2_detector_ce, "encodingHint", sizeof("encodingHint") - 1, CLD2::UNKNOWN_ENCODING, ZEND_ACC_PUBLIC); 375 | 376 | // Language 377 | zend_class_entry ce_Language; 378 | INIT_CLASS_ENTRY(ce_Language, "CLD2Language", cld2_language_methods); 379 | cld2_language_ce = zend_register_internal_class(&ce_Language); 380 | 381 | for (int i = 0; i < CLD2::NUM_LANGUAGES; i++) { 382 | CLD2::Language lan = (CLD2::Language)(i); 383 | zend_declare_class_constant_long(cld2_language_ce, CLD2::LanguageName(lan), strlen(CLD2::LanguageName(lan)), i); 384 | } 385 | 386 | // Encoding 387 | zend_class_entry ce_Encoding; 388 | INIT_CLASS_ENTRY(ce_Encoding, "CLD2Encoding", cld2_encoding_methods); 389 | cld2_encoding_ce = zend_register_internal_class(&ce_Encoding); 390 | 391 | for (int i = 0; i < CLD2::NUM_ENCODINGS; i++) { 392 | CLD2::Encoding enc = (CLD2::Encoding) (i); 393 | zend_declare_class_constant_long(cld2_encoding_ce, encodingStrings[i], strlen(encodingStrings[i]), i); 394 | } 395 | 396 | memcpy(&cld2_object_handlers, zend_get_std_object_handlers(), sizeof(zend_object_handlers)); 397 | cld2_object_handlers.clone_obj = NULL; 398 | 399 | return SUCCESS; 400 | } 401 | 402 | zend_module_entry cld2_module_entry = { 403 | #if ZEND_MODULE_API_NO >= 20010901 404 | STANDARD_MODULE_HEADER, 405 | #endif 406 | PHP_CLD2_EXTNAME, 407 | NULL, /* Functions */ 408 | PHP_MINIT(cld2), /* MINIT */ 409 | NULL, /* MSHUTDOWN */ 410 | NULL, /* RINIT */ 411 | NULL, /* RSHUTDOWN */ 412 | NULL, /* MINFO */ 413 | #if ZEND_MODULE_API_NO >= 20010901 414 | PHP_CLD2_EXTVER, 415 | #endif 416 | STANDARD_MODULE_PROPERTIES 417 | }; 418 | 419 | #ifdef COMPILE_DL_CLD2 420 | extern "C" { 421 | 
ZEND_GET_MODULE(cld2) 422 | } 423 | #endif 424 | 425 | /* 426 | * Local variables: 427 | * tab-width: 4 428 | * c-basic-offset: 4 429 | * End: 430 | * vim600: sw=4 ts=4 fdm=marker 431 | * vim<600: sw=4 ts=4 432 | */ 433 | --------------------------------------------------------------------------------