├── .gitignore ├── Makefile.buildbot ├── README ├── encodings.cc ├── pycldmodule.cc ├── setup.py ├── setup_full.py └── test.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .coverage 39 | .cache 40 | nosetests.xml 41 | coverage.xml 42 | 43 | # Translations 44 | *.mo 45 | *.pot 46 | 47 | # Django stuff: 48 | *.log 49 | 50 | # Sphinx documentation 51 | docs/_build/ 52 | 53 | # PyBuilder 54 | target/ 55 | -------------------------------------------------------------------------------- /Makefile.buildbot: -------------------------------------------------------------------------------- 1 | 2 | test: 3 | python setup.py build_ext --inplace 4 | python setup_full.py build_ext --inplace 5 | python test.py 6 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | Dick Sites (and others) at Google graciously provided a new version 2 | 2.0 of the compact language detector, here: 3 | 4 | https://code.google.com/p/cld2/ 5 | 6 | and I (lucene@mikemccandless.com) created the Python bindings and 7 | ported the C++ test case to test.py. 8 | 9 | This has been tested on Ubuntu 12.10, with both Python 2.7.3 and 10 | 3.2.3. 
11 | 12 | To build: 13 | 14 | * First check out cld2 and run internal/compile_libs.sh. This will 15 | create both libcld2.so (detects 83 languages) and libcld2_full.so 16 | (detects 163 languages). Install those libraries somewhere on 17 | your LD_LIBRARY_PATH, for example copy them into /usr/lib64. 18 | 19 | * Define the CLD2_PATH environment variable to point to where you 20 | checked out the CLD2 sources: export CLD2_PATH='/path/to/cld2' 21 | 22 | * python setup.py build 23 | 24 | * python setup_full.py build 25 | 26 | Note that all Python sources work with both python 2.x and 3.x so if 27 | you want to install for python3.x just repeat the above steps using 28 | python3 (or whatever python command runs python 3.x in your 29 | environment). 30 | 31 | To test both the small and full language tables: 32 | 33 | * python test.py 34 | 35 | The test produces a lot of output, due to the test cases testing the 36 | debug flags; this is normal. As long as it says OK in the end then 37 | the tests passed. 38 | 39 | To install: 40 | 41 | * python setup.py install (as root) 42 | 43 | * python setup_full.py install (as root) 44 | 45 | For documentation run: 46 | 47 | * python -c "import cld2; help(cld2.detect)" 48 | 49 | NOTE: you must pass only valid UTF-8 bytes to the detect function, 50 | otherwise you can hit a segmentation fault or get incorrect results. 51 | -------------------------------------------------------------------------------- /encodings.cc: -------------------------------------------------------------------------------- 1 | // 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // you may not use this file except in compliance with the License. 
4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 13 | // 14 | 15 | #include 16 | #include 17 | #include "compact_lang_det.h" 18 | #include "encodings.h" 19 | 20 | struct cld_encoding { 21 | const char *name; 22 | CLD2::Encoding encoding; 23 | }; 24 | 25 | extern const cld_encoding cld_encoding_info[] = { 26 | {"ISO_8859_1", CLD2::ISO_8859_1}, 27 | {"ISO_8859_2", CLD2::ISO_8859_2}, 28 | {"ISO_8859_3", CLD2::ISO_8859_3}, 29 | {"ISO_8859_4", CLD2::ISO_8859_4}, 30 | {"ISO_8859_5", CLD2::ISO_8859_5}, 31 | {"ISO_8859_6", CLD2::ISO_8859_6}, 32 | {"ISO_8859_7", CLD2::ISO_8859_7}, 33 | {"ISO_8859_8", CLD2::ISO_8859_8}, 34 | {"ISO_8859_9", CLD2::ISO_8859_9}, 35 | {"ISO_8859_10", CLD2::ISO_8859_10}, 36 | {"JAPANESE_EUC_JP", CLD2::JAPANESE_EUC_JP}, 37 | {"JAPANESE_SHIFT_JIS", CLD2::JAPANESE_SHIFT_JIS}, 38 | {"JAPANESE_JIS", CLD2::JAPANESE_JIS}, 39 | {"CHINESE_BIG5", CLD2::CHINESE_BIG5}, 40 | {"CHINESE_GB", CLD2::CHINESE_GB}, 41 | {"CHINESE_EUC_CN", CLD2::CHINESE_EUC_CN}, 42 | {"KOREAN_EUC_KR", CLD2::KOREAN_EUC_KR}, 43 | {"UNICODE_UNUSED", CLD2::UNICODE_UNUSED}, 44 | {"CHINESE_EUC_DEC", CLD2::CHINESE_EUC_DEC}, 45 | {"CHINESE_CNS", CLD2::CHINESE_CNS}, 46 | {"CHINESE_BIG5_CP950", CLD2::CHINESE_BIG5_CP950}, 47 | {"JAPANESE_CP932", CLD2::JAPANESE_CP932}, 48 | {"UTF8", CLD2::UTF8}, 49 | {"UNKNOWN_ENCODING", CLD2::UNKNOWN_ENCODING}, 50 | {"ASCII_7BIT", CLD2::ASCII_7BIT}, 51 | {"RUSSIAN_KOI8_R", CLD2::RUSSIAN_KOI8_R}, 52 | {"RUSSIAN_CP1251", CLD2::RUSSIAN_CP1251}, 53 | {"MSFT_CP1252", CLD2::MSFT_CP1252}, 54 | {"RUSSIAN_KOI8_RU", CLD2::RUSSIAN_KOI8_RU}, 55 | {"MSFT_CP1250", 
CLD2::MSFT_CP1250}, 56 | {"ISO_8859_15", CLD2::ISO_8859_15}, 57 | {"MSFT_CP1254", CLD2::MSFT_CP1254}, 58 | {"MSFT_CP1257", CLD2::MSFT_CP1257}, 59 | {"ISO_8859_11", CLD2::ISO_8859_11}, 60 | {"MSFT_CP874", CLD2::MSFT_CP874}, 61 | {"MSFT_CP1256", CLD2::MSFT_CP1256}, 62 | {"MSFT_CP1255", CLD2::MSFT_CP1255}, 63 | {"ISO_8859_8_I", CLD2::ISO_8859_8_I}, 64 | {"HEBREW_VISUAL", CLD2::HEBREW_VISUAL}, 65 | {"CZECH_CP852", CLD2::CZECH_CP852}, 66 | {"CZECH_CSN_369103", CLD2::CZECH_CSN_369103}, 67 | {"MSFT_CP1253", CLD2::MSFT_CP1253}, 68 | {"RUSSIAN_CP866", CLD2::RUSSIAN_CP866}, 69 | {"ISO_8859_13", CLD2::ISO_8859_13}, 70 | {"ISO_2022_KR", CLD2::ISO_2022_KR}, 71 | {"GBK", CLD2::GBK}, 72 | {"GB18030", CLD2::GB18030}, 73 | {"BIG5_HKSCS", CLD2::BIG5_HKSCS}, 74 | {"ISO_2022_CN", CLD2::ISO_2022_CN}, 75 | {"TSCII", CLD2::TSCII}, 76 | {"TAMIL_MONO", CLD2::TAMIL_MONO}, 77 | {"TAMIL_BI", CLD2::TAMIL_BI}, 78 | {"JAGRAN", CLD2::JAGRAN}, 79 | {"MACINTOSH_ROMAN", CLD2::MACINTOSH_ROMAN}, 80 | {"UTF7", CLD2::UTF7}, 81 | {"BHASKAR", CLD2::BHASKAR}, 82 | {"HTCHANAKYA", CLD2::HTCHANAKYA}, 83 | {"UTF16BE", CLD2::UTF16BE}, 84 | {"UTF16LE", CLD2::UTF16LE}, 85 | {"UTF32BE", CLD2::UTF32BE}, 86 | {"UTF32LE", CLD2::UTF32LE}, 87 | {"BINARYENC", CLD2::BINARYENC}, 88 | {"HZ_GB_2312", CLD2::HZ_GB_2312}, 89 | {"UTF8UTF8", CLD2::UTF8UTF8}, 90 | {"TAM_ELANGO", CLD2::TAM_ELANGO}, 91 | {"TAM_LTTMBARANI", CLD2::TAM_LTTMBARANI}, 92 | {"TAM_SHREE", CLD2::TAM_SHREE}, 93 | {"TAM_TBOOMIS", CLD2::TAM_TBOOMIS}, 94 | {"TAM_TMNEWS", CLD2::TAM_TMNEWS}, 95 | {"TAM_WEBTAMIL", CLD2::TAM_WEBTAMIL}, 96 | {"KDDI_SHIFT_JIS", CLD2::KDDI_SHIFT_JIS}, 97 | {"DOCOMO_SHIFT_JIS", CLD2::DOCOMO_SHIFT_JIS}, 98 | {"SOFTBANK_SHIFT_JIS", CLD2::SOFTBANK_SHIFT_JIS}, 99 | {"KDDI_ISO_2022_JP", CLD2::KDDI_ISO_2022_JP}, 100 | {"SOFTBANK_ISO_2022_JP", CLD2::SOFTBANK_ISO_2022_JP}, 101 | }; 102 | 103 | CLD2::Encoding EncodingFromName(const char *name) { 104 | for(int i=0;i 16 | #include 17 | 18 | #if PY_MAJOR_VERSION >= 3 19 | #define IS_PY3K 20 | 
#endif 21 | 22 | #include "compact_lang_det.h" 23 | #include "encodings.h" 24 | 25 | // From ../../internal: 26 | #include "lang_script.h" 27 | 28 | // impl is in ./encodings.cc: 29 | CLD2::Encoding EncodingFromName(const char *name); 30 | 31 | struct cld_encoding { 32 | const char *name; 33 | CLD2::Encoding encoding; 34 | }; 35 | 36 | extern const cld_encoding cld_encoding_info[]; 37 | namespace CLD2 { 38 | extern const int kNameToLanguageSize; 39 | extern const CharIntPair kNameToLanguage[]; 40 | } 41 | 42 | struct PYCLDState { 43 | PyObject *error; 44 | }; 45 | 46 | #ifdef IS_PY3K 47 | #define GETSTATE(m) ((struct PYCLDState*)PyModule_GetState(m)) 48 | #else 49 | #define GETSTATE(m) (&_state) 50 | static struct PYCLDState _state; 51 | #endif 52 | 53 | static PyObject * 54 | detect(PyObject *self, PyObject *args, PyObject *kwArgs) { 55 | char *bytes; 56 | int numBytes; 57 | 58 | CLD2::CLDHints cldHints; 59 | cldHints.tld_hint = 0; 60 | cldHints.content_language_hint = 0; 61 | 62 | int isPlainText = 0; 63 | const char* hintLanguage = 0; 64 | const char* hintEncoding = 0; 65 | 66 | int returnVectors = 0; 67 | 68 | int flagScoreAsQuads = 0; 69 | int flagHTML = 0; 70 | int flagCR = 0; 71 | int flagVerbose = 0; 72 | int flagQuiet = 0; 73 | int flagEcho = 0; 74 | 75 | static const char *kwList[] = {"utf8Bytes", 76 | "isPlainText", 77 | "hintTopLevelDomain", // "id" boosts Indonesian 78 | "hintLanguage", // ITALIAN or it boosts it 79 | "hintLanguageHTTPHeaders", // "mi,en" boosts Maori and English 80 | "hintEncoding", // SJS boosts Japanese 81 | "returnVectors", // True if you want byte-ranges of each matched language (approx 2X perf hit) 82 | 83 | /* Normally, several languages are detected solely by their Unicode script. 84 | Combined with appropritate lookup tables, this flag forces them instead 85 | to be detected via quadgrams. This can be a useful refinement when looking 86 | for meaningful text in these languages, instead of just character sets. 
87 | The default tables do not support this use. */ 88 | "debugScoreAsQuads", 89 | 90 | /* For each detection call, write an HTML file to stderr, showing the text 91 | chunks and their detected languages. */ 92 | "debugHTML", 93 | 94 | /* In that HTML file, force a new line for each chunk. */ 95 | "debugCR", 96 | 97 | /* In that HTML file, show every lookup entry. */ 98 | "debugVerbose", 99 | 100 | /* In that HTML file, suppress most of the output detail. */ 101 | "debugQuiet", 102 | 103 | /* Echo every input buffer to stderr. */ 104 | "debugEcho", 105 | 106 | NULL}; 107 | 108 | if (!PyArg_ParseTupleAndKeywords(args, kwArgs, "s#|izzzziiiiiii", 109 | (char **) kwList, 110 | &bytes, &numBytes, 111 | &isPlainText, 112 | &cldHints.tld_hint, 113 | &hintLanguage, 114 | &cldHints.content_language_hint, 115 | &hintEncoding, 116 | &returnVectors, 117 | &flagScoreAsQuads, 118 | &flagHTML, 119 | &flagCR, 120 | &flagVerbose, 121 | &flagQuiet, 122 | &flagEcho)) { 123 | return 0; 124 | } 125 | 126 | int flags = 0; 127 | if (flagScoreAsQuads != 0) { 128 | flags |= CLD2::kCLDFlagScoreAsQuads; 129 | } 130 | if (flagHTML != 0) { 131 | flags |= CLD2::kCLDFlagHtml; 132 | } 133 | if (flagCR != 0) { 134 | flags |= CLD2::kCLDFlagCr; 135 | } 136 | if (flagVerbose != 0) { 137 | flags |= CLD2::kCLDFlagVerbose; 138 | } 139 | if (flagQuiet != 0) { 140 | flags |= CLD2::kCLDFlagQuiet; 141 | } 142 | if (flagEcho != 0) { 143 | flags |= CLD2::kCLDFlagEcho; 144 | } 145 | 146 | PyObject *CLDError = GETSTATE(self)->error; 147 | 148 | if (hintLanguage == 0) { 149 | // no hint 150 | cldHints.language_hint = CLD2::UNKNOWN_LANGUAGE; 151 | } else { 152 | cldHints.language_hint = CLD2::GetLanguageFromName(hintLanguage); 153 | if (cldHints.language_hint == CLD2::UNKNOWN_LANGUAGE) { 154 | PyErr_Format(CLDError, "Unrecognized language hint name (got '%s'); see cld.LANGUAGES for recognized language names (note that currently external languages cannot be hinted)", hintLanguage); 155 | return 0; 156 | } 157 | } 
158 | 159 | if (hintEncoding == 0) { 160 | // no hint 161 | cldHints.encoding_hint = CLD2::UNKNOWN_ENCODING; 162 | } else { 163 | cldHints.encoding_hint = EncodingFromName(hintEncoding); 164 | if (cldHints.encoding_hint == CLD2::UNKNOWN_ENCODING) { 165 | PyErr_Format(CLDError, "Unrecognized encoding hint code (got '%s'); see cld.ENCODINGS for recognized encodings", hintEncoding); 166 | return 0; 167 | } 168 | } 169 | 170 | bool isReliable; 171 | CLD2::Language language3[3]; 172 | int percent3[3]; 173 | double normalized_score3[3]; 174 | int textBytesFound; 175 | CLD2::ResultChunkVector resultChunkVector; 176 | 177 | Py_BEGIN_ALLOW_THREADS 178 | CLD2::ExtDetectLanguageSummary(bytes, numBytes, 179 | isPlainText != 0, 180 | &cldHints, 181 | flags, 182 | language3, 183 | percent3, 184 | normalized_score3, 185 | returnVectors != 0 ? &resultChunkVector : 0, 186 | &textBytesFound, 187 | &isReliable); 188 | Py_END_ALLOW_THREADS 189 | 190 | PyObject *details = PyTuple_New(3); 191 | for(int idx=0;idx<3;idx++) { 192 | CLD2::Language lang = language3[idx]; 193 | // Steals ref: 194 | PyTuple_SET_ITEM(details, idx, Py_BuildValue("(ssif)", 195 | CLD2::LanguageName(lang), 196 | CLD2::LanguageCode(lang), 197 | percent3[idx], 198 | normalized_score3[idx])); 199 | } 200 | 201 | PyObject *result; 202 | 203 | if (returnVectors != 0) { 204 | PyObject *resultChunks = PyTuple_New(resultChunkVector.size()); 205 | for(unsigned int i=0;i(chunk.lang1); 208 | // Steals ref: 209 | PyTuple_SET_ITEM(resultChunks, i, 210 | Py_BuildValue("(iiss)", 211 | chunk.offset, chunk.bytes, 212 | CLD2::LanguageName(lang), 213 | CLD2::LanguageCode(lang))); 214 | } 215 | result = Py_BuildValue("(OiOO)", 216 | isReliable ? Py_True : Py_False, 217 | textBytesFound, 218 | details, 219 | resultChunks); 220 | } else { 221 | result = Py_BuildValue("(OiO)", 222 | isReliable ? 
Py_True : Py_False, 223 | textBytesFound, 224 | details); 225 | } 226 | Py_DECREF(details); 227 | return result; 228 | } 229 | 230 | const char *DOC = 231 | "Detect language(s) from a UTF8 string.\n\n" 232 | 233 | "Arguments:\n\n" 234 | " utf8Bytes: text to detect, encoded as UTF-8 bytes (required)\n\n" 235 | 236 | " isPlainText: if False, then the input is HTML and CLD will skip HTML tags,\n" 237 | " expand HTML entities, detect HTML tags, etc.\n\n" 238 | 239 | " hintTopLevelDomain: e.g., 'id' boosts Indonesian\n\n" 240 | 241 | " hintLanguage: e.g., 'ITALIAN' or 'it' boosts Italian; see cld.LANGUAGES\n" 242 | " for all known language\n\n" 243 | 244 | " hintLanguageHTTPHeaders: e.g., 'mi,en' boosts Maori and English\n\n" 245 | 246 | " hintEncoding: e.g, 'SJS' boosts Japanese; see cld.ENCODINGS for all known\n" 247 | " encodings\n\n" 248 | 249 | " returnVectors: if True then the vectors indicating which language was\n" 250 | " detected in which byte range are returned in addition to\n" 251 | " details. The vectors are a sequence of (bytesOffset,\n" 252 | " bytesLength, languageName, languageCode), in order.\n" 253 | " bytesOffset is the start of the vector, bytesLength\n" 254 | " is the length of the vector. Note that there is some\n" 255 | " added CPU cost if this is True.\n\n" 256 | 257 | " debugScoreAsQuads: Normally, several languages are detected solely by their\n" 258 | " Unicode script. Combined with appropritate lookup tables,\n" 259 | " this flag forces them instead to be detected via quadgrams.\n" 260 | " This can be a useful refinement when looking for meaningful\n" 261 | " text in these languages, instead of just character sets.\n" 262 | " The default tables do not support this use.\n\n" 263 | 264 | " debugHTML: For each detection call, write an HTML file to stderr, showing the\n" 265 | " text chunks and their detected languages. 
See\n" 266 | " docs/InterpretingCLD2UnitTestOutput.pdf to interpret this output.\n\n" 267 | 268 | " debugCR: In that HTML file, force a new line for each chunk.\n\n" 269 | 270 | " debugVerbose: In that HTML file, show every lookup entry.\n\n" 271 | 272 | " debugQuiet: In that HTML file, suppress most of the output detail.\n\n" 273 | 274 | " debugEcho: Echo every input buffer to stderr.\n\n\n" 275 | 276 | "Returns:\n\n" 277 | " isReliable, textBytesFound, details when returnVectors is False\n" 278 | " isReliable, textBytesFound, details, vectors when returnVectors is True\n\n" 279 | 280 | " isReliable (boolean) is True if the detection is high confidence\n\n" 281 | 282 | " textBytesFound (int) is the total number of bytes of text detected\n\n" 283 | 284 | " details is a tuple of up to three detected languages, where each is\n" 285 | " tuple is (languageName, languageCode, percent, score). percent is\n" 286 | " what percentage of the original text was detected as this language\n" 287 | " and score is the confidence score for that language." 
288 | ; 289 | 290 | static PyMethodDef CLDMethods[] = { 291 | {"detect", (PyCFunction) detect, METH_VARARGS | METH_KEYWORDS, DOC}, 292 | {0, 0} /* Sentinel */ 293 | }; 294 | 295 | #ifdef IS_PY3K 296 | 297 | static int cld_traverse(PyObject *m, visitproc visit, void *arg) { 298 | Py_VISIT(GETSTATE(m)->error); 299 | return 0; 300 | } 301 | 302 | static int cld_clear(PyObject *m) { 303 | Py_CLEAR(GETSTATE(m)->error); 304 | return 0; 305 | } 306 | 307 | static struct PyModuleDef moduledef = { 308 | PyModuleDef_HEAD_INIT, 309 | "cld", 310 | NULL, 311 | sizeof(struct PYCLDState), 312 | CLDMethods, 313 | NULL, 314 | cld_traverse, 315 | cld_clear, 316 | NULL 317 | }; 318 | 319 | #define INITERROR return NULL 320 | 321 | //PyObject * 322 | PyMODINIT_FUNC 323 | #ifdef CLD2_FULL 324 | PyInit_cld2full(void) 325 | #else 326 | PyInit_cld2(void) 327 | #endif 328 | 329 | #else // IS_PY3K 330 | 331 | #define INITERROR return 332 | 333 | PyMODINIT_FUNC 334 | #ifdef CLD2_FULL 335 | initcld2full() 336 | #else 337 | initcld2() 338 | #endif 339 | #endif 340 | { 341 | 342 | #ifdef IS_PY3K 343 | PyObject *m = PyModule_Create(&moduledef); 344 | #else 345 | #ifdef CLD2_FULL 346 | PyObject* m = Py_InitModule("cld2full", CLDMethods); 347 | #else 348 | PyObject* m = Py_InitModule("cld2", CLDMethods); 349 | #endif 350 | #endif 351 | 352 | if (m == NULL) { 353 | INITERROR; 354 | } 355 | 356 | struct PYCLDState *st = GETSTATE(m); 357 | 358 | st->error = PyErr_NewException((char *) "cld.error", NULL, NULL); 359 | if (st->error == NULL) { 360 | Py_DECREF(m); 361 | INITERROR; 362 | } 363 | 364 | // Set module-global ENCODINGS tuple: 365 | PyObject* pyEncs = PyTuple_New(CLD2::NUM_ENCODINGS-1); 366 | // Steals ref: 367 | PyModule_AddObject(m, "ENCODINGS", pyEncs); 368 | unsigned int upto = 0; 369 | for(int encIDX=0;encIDX(encIDX) != CLD2::UNKNOWN_ENCODING) { 371 | if (upto == PyTuple_Size(pyEncs)) { 372 | PyErr_SetString(st->error, "failed to initialize cld.ENCODINGS"); 373 | INITERROR; 374 | } 375 | 
PyTuple_SET_ITEM(pyEncs, upto++, PyUnicode_FromString(cld_encoding_info[encIDX].name)); 376 | } 377 | } 378 | 379 | if (upto != PyTuple_Size(pyEncs)) { 380 | PyErr_SetString(st->error, "failed to initialize cld.ENCODINGS"); 381 | INITERROR; 382 | } 383 | 384 | // Set module-global LANGUAGES tuple: 385 | PyObject* pyLangs = PyTuple_New(CLD2::kNameToLanguageSize-1); 386 | // Steals ref: 387 | PyModule_AddObject(m, "LANGUAGES", pyLangs); 388 | upto = 0; 389 | for(int i=0;ierror, "failed to initialize cld.LANGUAGES"); 394 | INITERROR; 395 | } 396 | CLD2::Language lang = CLD2::GetLanguageFromName(name); 397 | if (lang == CLD2::UNKNOWN_LANGUAGE) { 398 | PyErr_SetString(st->error, "failed to initialize cld.LANGUAGES"); 399 | INITERROR; 400 | } 401 | PyTuple_SET_ITEM(pyLangs, 402 | upto++, 403 | Py_BuildValue("(zz)", 404 | LanguageName(lang), 405 | LanguageCode(lang))); 406 | } 407 | } 408 | 409 | if (upto != PyTuple_Size(pyLangs)) { 410 | PyErr_SetString(st->error, "failed to initialize cld.LANGUAGES"); 411 | INITERROR; 412 | } 413 | 414 | // Steals ref: 415 | #ifdef IS_PY3K 416 | PyModule_AddObject(m, "VERSION", PyUnicode_FromString(CLD2::DetectLanguageVersion())); 417 | #else 418 | PyModule_AddObject(m, "VERSION", PyString_FromString(CLD2::DetectLanguageVersion())); 419 | #endif 420 | 421 | // Set module-global DETECTED_LANGUAGES tuple: 422 | 423 | upto = 0; 424 | 425 | #ifdef CLD2_FULL 426 | PyObject* detLangs = PyTuple_New(165); 427 | 428 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("ABKHAZIAN")); 429 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("AFAR")); 430 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("AFRIKAANS")); 431 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("AKAN")); 432 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("ALBANIAN")); 433 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("AMHARIC")); 434 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("ARABIC")); 435 | 
PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("ARMENIAN")); 436 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("ASSAMESE")); 437 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("AYMARA")); 438 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("AZERBAIJANI")); 439 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("BASHKIR")); 440 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("BASQUE")); 441 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("BELARUSIAN")); 442 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("BENGALI")); 443 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("BIHARI")); 444 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("BISLAMA")); 445 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("BOSNIAN")); 446 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("BRETON")); 447 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("BULGARIAN")); 448 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("BURMESE")); 449 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("CATALAN")); 450 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("CEBUANO")); 451 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("CHEROKEE")); 452 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("CORSICAN")); 453 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("CROATIAN")); 454 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("CZECH")); 455 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("Chinese")); 456 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("ChineseT")); 457 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("DANISH")); 458 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("DHIVEHI")); 459 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("DUTCH")); 460 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("DZONGKHA")); 461 | PyTuple_SET_ITEM(detLangs, upto++, 
PyUnicode_FromString("ENGLISH")); 462 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("ESPERANTO")); 463 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("ESTONIAN")); 464 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("FAROESE")); 465 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("FIJIAN")); 466 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("FINNISH")); 467 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("FRENCH")); 468 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("FRISIAN")); 469 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("GALICIAN")); 470 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("GANDA")); 471 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("GEORGIAN")); 472 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("GERMAN")); 473 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("GREEK")); 474 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("GREENLANDIC")); 475 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("GUARANI")); 476 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("GUJARATI")); 477 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("HAITIAN_CREOLE")); 478 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("HAUSA")); 479 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("HAWAIIAN")); 480 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("HEBREW")); 481 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("HINDI")); 482 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("HMONG")); 483 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("HUNGARIAN")); 484 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("ICELANDIC")); 485 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("IGBO")); 486 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("INDONESIAN")); 487 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("INTERLINGUA")); 488 
| PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("INTERLINGUE")); 489 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("INUKTITUT")); 490 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("INUPIAK")); 491 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("IRISH")); 492 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("ITALIAN")); 493 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("JAVANESE")); 494 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("Japanese")); 495 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("KANNADA")); 496 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("KASHMIRI")); 497 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("KAZAKH")); 498 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("KHASI")); 499 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("KHMER")); 500 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("KINYARWANDA")); 501 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("KURDISH")); 502 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("KYRGYZ")); 503 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("Korean")); 504 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("LAOTHIAN")); 505 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("LATIN")); 506 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("LATVIAN")); 507 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("LIMBU")); 508 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("LINGALA")); 509 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("LITHUANIAN")); 510 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("LUXEMBOURGISH")); 511 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("MACEDONIAN")); 512 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("MALAGASY")); 513 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("MALAY")); 514 | PyTuple_SET_ITEM(detLangs, upto++, 
PyUnicode_FromString("MALAYALAM")); 515 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("MALTESE")); 516 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("MANX")); 517 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("MAORI")); 518 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("MARATHI")); 519 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("MAURITIAN_CREOLE")); 520 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("MONGOLIAN")); 521 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("NAURU")); 522 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("NDEBELE")); 523 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("NEPALI")); 524 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("NORWEGIAN")); 525 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("NORWEGIAN_N")); 526 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("NYANJA")); 527 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("OCCITAN")); 528 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("ORIYA")); 529 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("OROMO")); 530 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("PASHTO")); 531 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("PEDI")); 532 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("PERSIAN")); 533 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("POLISH")); 534 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("PORTUGUESE")); 535 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("PUNJABI")); 536 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("QUECHUA")); 537 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("RHAETO_ROMANCE")); 538 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("ROMANIAN")); 539 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("RUNDI")); 540 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("RUSSIAN")); 541 | 
PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("SAMOAN")); 542 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("SANGO")); 543 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("SANSKRIT")); 544 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("SCOTS")); 545 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("SCOTS_GAELIC")); 546 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("SERBIAN")); 547 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("SESELWA")); 548 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("SESOTHO")); 549 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("SHONA")); 550 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("SINDHI")); 551 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("SINHALESE")); 552 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("SISWANT")); 553 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("SLOVAK")); 554 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("SLOVENIAN")); 555 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("SOMALI")); 556 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("SPANISH")); 557 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("SUNDANESE")); 558 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("SWAHILI")); 559 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("SWEDISH")); 560 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("SYRIAC")); 561 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("TAGALOG")); 562 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("TAJIK")); 563 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("TAMIL")); 564 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("TATAR")); 565 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("TELUGU")); 566 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("THAI")); 567 | PyTuple_SET_ITEM(detLangs, upto++, 
PyUnicode_FromString("TIBETAN")); 568 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("TIGRINYA")); 569 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("TONGA")); 570 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("TSONGA")); 571 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("TSWANA")); 572 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("TURKISH")); 573 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("TURKMEN")); 574 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("UIGHUR")); 575 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("UKRAINIAN")); 576 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("URDU")); 577 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("UZBEK")); 578 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("VENDA")); 579 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("VIETNAMESE")); 580 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("VOLAPUK")); 581 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("WARAY_PHILIPPINES")); 582 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("WELSH")); 583 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("WOLOF")); 584 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("XHOSA")); 585 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("X_Buginese")); 586 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("X_Gothic")); 587 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("X_KLINGON")); 588 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("X_PIG_LATIN")); 589 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("YIDDISH")); 590 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("YORUBA")); 591 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("ZHUANG")); 592 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("ZULU")); 593 | #else 594 | PyObject* detLangs = PyTuple_New(89); 595 | 596 | // List originally 
sent by Dick Sites on 7/17/2013, then I 597 | // added 6 new languages from the Jan 2014 release: 598 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("AFRIKAANS")); 599 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("ALBANIAN")); 600 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("ARABIC")); 601 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("ARMENIAN")); 602 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("AZERBAIJANI")); 603 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("BASQUE")); 604 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("BELARUSIAN")); 605 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("BENGALI")); 606 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("BIHARI")); 607 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("BOSNIAN")); 608 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("BULGARIAN")); 609 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("CATALAN")); 610 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("CEBUANO")); 611 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("CHEROKEE")); 612 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("CROATIAN")); 613 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("CZECH")); 614 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("Chinese")); 615 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("ChineseT")); 616 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("DANISH")); 617 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("DHIVEHI")); 618 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("DUTCH")); 619 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("ENGLISH")); 620 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("ESTONIAN")); 621 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("FINNISH")); 622 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("FRENCH")); 623 | 
PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("GALICIAN")); 624 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("GANDA")); 625 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("GEORGIAN")); 626 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("GERMAN")); 627 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("GREEK")); 628 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("GUJARATI")); 629 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("HAITIAN_CREOLE")); 630 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("HAUSA")); 631 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("HEBREW")); 632 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("HINDI")); 633 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("HMONG")); 634 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("HUNGARIAN")); 635 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("ICELANDIC")); 636 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("IGBO")); 637 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("INDONESIAN")); 638 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("INUKTITUT")); 639 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("IRISH")); 640 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("ITALIAN")); 641 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("JAVANESE")); 642 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("Japanese")); 643 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("KANNADA")); 644 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("KHMER")); 645 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("KINYARWANDA")); 646 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("Korean")); 647 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("LAOTHIAN")); 648 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("LATVIAN")); 649 | PyTuple_SET_ITEM(detLangs, upto++, 
PyUnicode_FromString("LIMBU")); 650 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("LITHUANIAN")); 651 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("MACEDONIAN")); 652 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("MALAY")); 653 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("MALAYALAM")); 654 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("MALTESE")); 655 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("MARATHI")); 656 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("NEPALI")); 657 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("NORWEGIAN")); 658 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("ORIYA")); 659 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("PERSIAN")); 660 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("POLISH")); 661 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("PORTUGUESE")); 662 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("PUNJABI")); 663 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("ROMANIAN")); 664 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("RUSSIAN")); 665 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("SCOTS_GAELIC")); 666 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("SERBIAN")); 667 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("SINHALESE")); 668 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("SLOVAK")); 669 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("SLOVENIAN")); 670 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("SOMALI")); 671 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("SPANISH")); 672 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("SWAHILI")); 673 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("SWEDISH")); 674 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("SYRIAC")); 675 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("TAGALOG")); 676 
| PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("TAMIL")); 677 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("TELUGU")); 678 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("THAI")); 679 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("TURKISH")); 680 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("UKRAINIAN")); 681 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("URDU")); 682 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("VIETNAMESE")); 683 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("WELSH")); 684 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("YIDDISH")); 685 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("YORUBA")); 686 | PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("ZULU")); 687 | #endif 688 | 689 | // Steals ref: 690 | PyModule_AddObject(m, "DETECTED_LANGUAGES", detLangs); 691 | 692 | if (upto != PyTuple_Size(detLangs)) { 693 | PyErr_SetString(st->error, "failed to initialize cld.DETECTED_LANGUAGES"); 694 | INITERROR; 695 | } 696 | 697 | // Steals ref: 698 | PyModule_AddObject(m, "error", st->error); 699 | #ifdef IS_PY3K 700 | return m; 701 | #endif 702 | } 703 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and
# limitations under the License.
#

from distutils.core import setup, Extension
import distutils.core
import platform
import subprocess
import sys
import os

# NOTE: change this to point to where you checked out the CLD2
# sources, or define the CLD2_PATH environment variable
CLD2_PATH = os.environ.get('CLD2_PATH', '/usr/include/cld2')


# Test suite
class cldtest(distutils.core.Command):
    """Custom ``test`` command: run the repository's test script.

    The script is executed in a child interpreter (the same executable
    that is running this setup script) and its exit status becomes the
    exit status of the command.  test.py imports both ``cld2`` and
    ``cld2full`` from the current directory, so build both extensions
    in place (``build_ext --inplace``) before running it.
    """

    # user_options, initialize_options and finalize_options must be overridden.
    user_options = []

    def initialize_options(self):
        pass

    def finalize_options(self):
        pass

    def run(self):
        # The test script sits next to this file as test.py; the path is
        # relative because setup scripts are run from their own directory.
        status = subprocess.call([sys.executable, 'test.py'])
        raise SystemExit(status)


# Extension built against the small-table library (libcld2).
module = Extension('cld2',
                   language='c++',
                   include_dirs=['%s/public' % CLD2_PATH,
                                 '%s/internal' % CLD2_PATH],
                   libraries=['cld2'],
                   sources=['pycldmodule.cc', 'encodings.cc'],
                   )

setup(name='python-cld2',
      version='2.0',
      author='Michael McCandless',
      author_email='mail@mikemccandless.com',
      description='Python bindings around Google Chromium\'s embedded compact language detection library (CLD2)',
      ext_modules=[module],
      # Register the command so that ``python setup.py test`` works.
      cmdclass={'test': cldtest},
      requires=['cld2'],
      license='Apache2',
      url='https://github.com/scrapinghub/python-cld2/',
      classifiers=[
          # Matches the ``license`` field and the Apache 2.0 file headers.
          'License :: OSI Approved :: Apache Software License',
          'Operating System :: MacOS :: MacOS X',
          'Operating System :: Microsoft :: Windows',
          'Operating System :: POSIX :: Linux',
          'Programming Language :: C++',
          'Programming Language :: Python',
          'Development Status :: 4 - Beta',
          'Intended Audience :: Developers',
          'Topic :: Text Processing :: Linguistic',
      ],
      )
# --------------------------------------------------------------------------------
# /setup_full.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python

#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from distutils.core import setup, Extension
import distutils.core
import platform
import subprocess
import sys
import os

# NOTE: change this to point to where you checked out the CLD2
# sources, or define the CLD2_PATH environment variable
CLD2_PATH = os.environ.get('CLD2_PATH', '/usr/include/cld2')


# Test suite
class cldtest(distutils.core.Command):
    """Custom ``test`` command: run the repository's test script.

    The script is executed in a child interpreter (the same executable
    that is running this setup script) and its exit status becomes the
    exit status of the command.  test.py imports both ``cld2`` and
    ``cld2full`` from the current directory, so build both extensions
    in place (``build_ext --inplace``) before running it.
    """

    # user_options, initialize_options and finalize_options must be overridden.
    user_options = []

    def initialize_options(self):
        pass

    def finalize_options(self):
        pass

    def run(self):
        # The test script sits next to this file as test.py; the path is
        # relative because setup scripts are run from their own directory.
        status = subprocess.call([sys.executable, 'test.py'])
        raise SystemExit(status)


# Extension built against the full-table library (libcld2_full);
# -DCLD2_FULL selects the larger DETECTED_LANGUAGES list in pycldmodule.cc.
module = Extension('cld2full',
                   language='c++',
                   extra_compile_args=['-DCLD2_FULL'],
                   include_dirs=['%s/public' % CLD2_PATH,
                                 '%s/internal' % CLD2_PATH],
                   libraries=['cld2_full'],
                   sources=['pycldmodule.cc', 'encodings.cc'],
                   # The recognised option is ``library_dirs``; the former
                   # ``libdirs`` keyword was unknown to Extension and ignored,
                   # so ./build was never on the linker search path.
                   library_dirs=['./build'],
                   )

setup(name='python-cld2',
      version='2.0',
      author='Michael McCandless',
      author_email='mail@mikemccandless.com',
      description='Python bindings around Google Chromium\'s embedded compact language detection library (CLD2)',
      ext_modules=[module],
      # Register the command so that ``python setup_full.py test`` works.
      cmdclass={'test': cldtest},
      requires=['cld2'],
      license='Apache2',
      url='https://github.com/scrapinghub/python-cld2/',
      classifiers=[
          # Matches the ``license`` field and the Apache 2.0 file headers.
          'License :: OSI Approved :: Apache Software License',
          'Operating System :: MacOS :: MacOS X',
          'Operating System :: Microsoft :: Windows',
          'Operating System :: POSIX :: Linux',
          'Programming Language :: C++',
          'Programming Language :: Python',
          'Development Status :: 4 - Beta',
          'Intended Audience :: Developers',
          'Topic :: Text Processing :: Linguistic',
      ],
      )
# --------------------------------------------------------------------------------
# /test.py:
# --------------------------------------------------------------------------------
# coding=utf-8

#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import os 18 | import sys 19 | import stat 20 | import unittest 21 | import traceback 22 | 23 | # Get just major.minor version of currently running Python, ie 3.2.3 24 | # -> 3.2: 25 | version = sys.version.split()[0] 26 | version = version[:version.rfind('.')] 27 | 28 | moduleDir = "." 29 | sys.path.insert(0, moduleDir) 30 | 31 | import cld2full 32 | import cld2 33 | 34 | VERBOSE = False 35 | 36 | fr_en_Latn = 'France is the largest country in Western Europe and the third-largest in Europe as a whole. A accès aux chiens et aux frontaux qui lui ont été il peut consulter et modifier ses collections et exporter Cet article concerne le pays européen aujourd’hui appelé République française. Pour d’autres usages du nom France, Pour une aide rapide et effective, veuiller trouver votre aide dans le menu ci-dessus. Motoring events began soon after the construction of the first successful gasoline-fueled automobiles. 
The quick brown fox jumped over the lazy dog' 37 | 38 | testData = ( 39 | ('ENGLISH', 'confiscation of goods is assigned as the penalty part most of the courts consist of members and when it is necessary to bring public cases before a jury of members two courts combine for the purpose the most important cases of all are brought jurors or'), 40 | ('ARMENIAN', ' ա յ եվ նա հիացած աչքերով նայում է հինգհարկանի շենքի տարօրինակ փոքրիկ քառակուսի պատուհաններին դեռ մենք շատ ենք հետամնաց ասում է նա այսպես է'), 41 | ('CHEROKEE', 'ᎠᎢᏍᎩ ᎠᏟᎶᏍᏗ ᏥᏄᏍᏛᎩ ᎦᎫᏍᏛᏅᎯ ᎾᎥᎢ'), 42 | ('DHIVEHI', ' ހިންދީ ބަހުން ވާހަކަ ދައްކާއިރު ދެވަނަ ބަހެއްގެ ގޮތުގައާއި އެނޫން ގޮތްގޮތުން ހިންދީ ބަހުން ވާހަކަ ދައްކާ މީހުންގެ އަދަދު މިލިއަނަށް'), 43 | ('GEORGIAN', ' ა ბირთვიდან მიღებული ელემენტი მენდელეევის პერიოდულ სიტემაში გადაინაცვლებს ორი უჯრით'), 44 | ('GREEK', ' ή αρνητική αναζήτηση λέξης κλειδιού καταστήστε τις μεμονωμένες λέξεις κλειδιά περισσότερο στοχοθετημένες με τη μετατροπή τους σε'), 45 | ('GUJARATI', ' આના પરિણામ પ્રમાણસર ફોન્ટ અવતરણ ચિન્હવાળા પાઠને છુપાવો બધા સમૂહો શોધાયા હાલનો જ સંદેશ વિષયની'), 46 | ('INUKTITUT', 'ᐃᑯᒪᒻᒪᑦ ᕿᓈᖏᓐᓇᓲᖑᒻᒪᑦ ᑎᑎᖅᑕᓕᒫᖅᓃᕕᑦ ᑎᑦᕆᐊᑐᓐᖏᑦᑕᑎᑦ ᑎᑎᖅᑕᑉᐱᑦ ᓯᕗᓂᖓᓂ ᑎᑎᖅᖃᖅ ᑎᑎᕆᐊᑐᓐᖏᑕᐃᑦ ᕿᓂᓲᖑᔪᒍᑦ ᑎᑎᖅᑕᓕᒫᖅᓃᕕᑦ'), 47 | ('KANNADA', ' ಂಠಯ್ಯನವರು ತುಮಕೂರು ಜಿಲ್ಲೆಯ ಚಿಕ್ಕನಾಯಕನಹಳ್ಳಿ ತಾಲ್ಲೂಕಿನ ತೀರ್ಥಪುರ ವೆಂಬ ಸಾಧಾರಣ ಹಳ್ಳಿಯ ಶ್ಯಾನುಭೋಗರ'), 48 | ('KHMER', ' ក ខ គ ឃ ង ច ឆ ជ ឈ ញ ដ ឋ ឌ ឍ ណ ត ថ ទ ធ ន ប ផ ព ភ ម យ រ ល វ ស ហ ឡ អ ឥ ឦ ឧ ឪ ឫ ឬ ឯ ឱ ទាំងអស់'), 49 | ('LAOTHIAN', ' ກຫາທົ່ວທັງເວັບ ແລະໃນເວັບໄຮ້ສາຍ ທຳອິດໃຫ້ທຳການຊອກຫາກ່ອນ ຈາກນັ້ນ ໃຫ້ກົດປຸ່ມເມນູ ໃນໜ້າຜົນໄດ້'), 50 | ('LIMBU', 'ᤁᤡᤖᤠᤳ ᤕᤠᤰᤌᤢᤱ ᤆᤢᤶᤗᤢᤱᤖᤧ ᤛᤥᤎᤢᤱᤃᤧᤴ ᤀᤡᤔᤠᤴᤛᤡᤱ ᤆᤧᤶᤈᤱᤗᤧ ᤁᤢᤔᤡᤱᤅᤥ ᤏᤠᤈᤡᤖᤡ ᤋᤱᤒᤣ ᥈᥆᥆᥉ ᤒᤠ ᤈᤏᤘᤖᤡ ᤗᤠᤏᤢᤀᤠᤱ ᤁ᤹ᤏᤠ ᤋᤱᤒᤣ ᤁᤠᤰ ᤏᤠ᤺ᤳᤋᤢ ᤕᤢᤖᤢᤒᤠ ᤀᤡᤔᤠᤴᤛᤡᤱ ᤋᤱᤃᤡᤵᤛᤡᤱ ᤌᤡᤶᤒᤣᤴ ᤂᤠᤃᤴ ᤛᤡᤛᤣ᤺ᤰᤗᤠ ᥇᥍ ᤂᤧᤴ ᤀᤡᤛᤡᤰ ᥇ ᤈᤏᤘᤖᤡ ᥈᥆᥆᥊ ᤀᤥ ᤏᤠᤛᤢᤵ ᤆᤥ᤺ᤰᤔᤠ ᤌᤡᤶᤒᤣ ᤋᤱᤃᤠᤶᤛᤡᤱᤗ ᤐᤳᤐᤠ ᤀᤡᤱᤄᤱ ᤘᤠ᤹'), 51 | ('MALAYALAM', ' ം അങ്ങനെ ഞങ്ങള് അവരുടെ മുമ്പില് നിന്നു ഔടും ഉടനെ നിങ്ങള് പതിയിരിപ്പില് നിന്നു എഴുന്നേറ്റു'), 52 | ('ORIYA', 'ଅକ୍ଟୋବର ଡିସେମ୍ବର'), 53 | ('PUNJABI', ' ਂ ਦਿਨਾਂ ਵਿਚ ਭਾਈ ਸਾਹਿਬ ਦੀ ਬੁੱਚੜ ਗੋਬਿੰਦ ਰਾਮ ਨਾਲ ਅੜਫਸ 
ਚੱਲ ਰਹੀ ਸੀ ਗੋਬਿੰਦ ਰਾਮ ਨੇ ਭਾਈ ਸਾਹਿਬ ਦੀਆਂ ਭੈਣਾ'), 54 | ('SINHALESE', ' අනුරාධ මිහිඳුකුල නමින් සකුරා ට ලිපියක් තැපෑලෙන් එවා තිබුණා කි ් රස්ටි ෂෙල්ටන් ප ් රනාන්දු ද'), 55 | ('SYRIAC', 'ܐܕܪܝܣ ܓܛܘ ܫܘܪܝܐ ܡܢ ܦܪܢܣܐ ܡܢ ܐܣܦܢܝܐ ܚܐܪܘܬܐ ܒܐܕܪ ܒܢܝܣܢ ܫܛܝܚܘܬܐ ܟܠܢܝܐ ܡܝ̈ܐ ܒܥܠܡܐ'), 56 | ('TAGALOG', ' ᜋᜇ᜔ ᜐᜓᜎᜆ᜔ ᜃ ᜈᜅ᜔ ᜊᜌ᜔ᜊᜌᜒᜈ᜔ ᜂᜉᜅ᜔᜔ ᜋᜐᜈᜌ᜔ ᜎᜅ᜔ ᜁᜐ ᜉᜅ᜔ ᜀᜃ᜔ᜎᜆ᜔ ᜆᜓᜅ᜔ᜃᜓᜎ᜔ ᜐ ᜊᜌ᜔ᜊᜌᜒᜈ᜔ ᜐ ᜆᜒᜅᜒᜈ᜔ ᜃᜓ'), 57 | ('TAMIL', ' அங்கு ராஜேந்திர சோழனால் கட்டப்பட்ட பிரம்மாண்டமான சிவன் கோவில் ஒன்றும் உள்ளது தொகு'), 58 | ('TELUGU', ' ఁ దనర జయించిన తత్వ మరసి చూడఁ దాన యగును రాజయోగి యిట్లు తేజరిల్లుచు నుండు విశ్వదాభిరామ వినర వేమ'), 59 | ('THAI', ' กฏในการค้นหา หรือหน้าเนื้อหา หากท่านเลือกลงโฆษณา ท่านอาจจะปรับต้องเพิ่มงบประมาณรายวันตา'), 60 | ('Chinese', '产品的简报和公告 提交该申请后无法进行更改 请确认您的选择是正确的 对于要提交的图书 我确认 我是版权所有者或已得到版权所有者的授权 要更改您的国家 地区 请在此表的最上端更改您的'), 61 | ('ChineseT', ' 之前為 帳單交易作業區 已變更 廣告內容 之前為 銷售代表 之前為 張貼日期為 百分比之前為 合約 為 目標對象條件已刪除 結束日期之前為'), 62 | ('Japanese', ' このペ ジでは アカウントに指定された予算の履歴を一覧にしています それぞれの項目には 予算額と特定期間のステ タスが表示されます 現在または今後の予算を設定するには'), 63 | ('Korean', ' 개별적으로 리포트 액세스 권한을 부여할 수 있습니다 액세스 권한 부여사용자에게 프로필 리포트에 액세스할 수 있는 권한을 부여하시려면 가용 프로필 상자에서 프로필 이름을 선택한 다음'), 64 | ('AFRIKAANS', ' aam skukuza die naam beteken hy wat skoonvee of hy wat alles onderstebo keer wysig bosveldkampe boskampe is kleiner afgeleë ruskampe wat oor min fasiliteite beskik daar is geen restaurante of winkels nie en slegs oornagbesoekers word toegelaat bateleur'), 65 | ('ALBANIAN', ' a do të kërkoni nga beogradi që të njohë pavarësinë e kosovës zoti thaçi prishtina është gati ta njoh pavarësinë e serbisë ndërsa natyrisht se do të kërkohet një gjë e tillë që edhe beogradi ta njoh shtetin e pavarur dhe sovran të'), 66 | ('ARABIC', 'احتيالية بيع أي حساب'), 67 | ('AZERBAIJANI', ' a az qalıb breyn rinq intellektual oyunu üzrə yarışın zona mərhələləri keçirilib miq un qalıqlarının dənizdən çıxarılması davam edir məhəmməd peyğəmbərin karikaturalarını çap edən qəzetin baş redaktoru iş otağında ölüb'), 68 | ('BASQUE', ' a den eraso bat honen kontra hortaz 
eragiketa bakarrik behar dituen eraso batek aes apurtuko luke nahiz eta oraingoz eraso bideraezina izan gaur egungo teknologiaren mugak direla eta oraingoz kezka hauek alde batera utzi daitezke orain arteko indar'), 69 | ('BELARUSIAN', ' а друкаваць іх не было тэхнічна магчыма бліжэй за вільню тым самым часам нямецкае кіраўніцтва прапаноўвала апроч ўвядзення лацінкі яе'), 70 | ('BENGALI', 'গ্যালারির ৩৮ বছর পূর্তিতে মূল্যছাড় অর্থনীতি বিএনপির ওয়াক আউট তপন চৌধুরী হারবাল অ্যাসোসিয়েশনের সভাপতি আন্তর্জাতিক পরামর্শক বোর্ড দিয়ে শরিয়াহ্ ইনন্ডেক্স করবে সিএসই মালিকপক্ষের কান্না, শ্রমিকের অনিশ্চয়তা মতিঝিলে সমাবেশ নিষিদ্ধ: এফবিসিসিআইয়ের ধন্যবাদ বিনোদন বিশেষ প্রতিবেদন বাংলালিংকের গ্র্যান্ডমাস্টার সিজন-৩ ব্রাজিলে বিশ্বকাপ ফুটবল আয়োজনবিরোধী বিক্ষোভ দেশের নিরাপত্তার চেয়ে অনেক বেশি সচেতন । প্রার্থীদের দক্ষতা ও যোগ্যতার পাশাপাশি তারা জাতীয় ইস্যুগুলোতে প্রাধান্য দিয়েছেন । ” পাঁচটি সিটিতে ২০ লাখ ভোটারদের দিয়ে জাতীয় নির্বাচনে ৮ কোটি ভোটারদের সঙ্গে তুলনা করা যাবে কি একজন দর্শকের এমন প্রশ্নে জবাবে আব্দুল্লাহ আল নোমান বলেন , “ এই পাঁচটি সিটি কর্পোরেশন নির্বাচন দেশের পাঁচটি বড় বিভাগের প্রতিনিধিত্ব করছে । এছাড়া এখানকার ভোটার রা সবাই সচেতন । তারা'), 71 | ('BIHARI', 'काल में उनका हमला से बचे खाती एहिजा भाग के अइले आ भोजपुर नाम से नगर बसवले. एकरा बारे में विस्तार से जानकारी नीचे दीहल गइल बा. बाकिर आश्चर्यजनक रूप से मालवा के राजा भोज के बिहार आवे आ भोजपुर नगर बसावे आ चाहे भोजपुरी के साथे उनकर कवनो संबंध होखे के कवनो जानकारी भोपाल के भोज संस्थान आ चाहे मध्य प्रदेश के इतिहासकार लोगन के तनिको नइखे. हालांकि ऊ सब लोग एह बात के मानत बा कि एकरा बारे में अबहीं तकले मूर्ति बनवइलें. राजा भोज के जवना जगहा पऽ वाग्देवी के दर्शन भइल रहे, ओही स्थान पऽ एह मूर्ति के स्थापना कइल गइल. अब अगर एह मंदिर के एह शिलालेख के तस्वीर (पृष्ठ संख्या 33 पऽ प्रकाशित) रउआ धेयान से देखीं तऽ एकरा पऽ कैथी लिपि में -सीताराम- लिखल साफ लउकत बा. कैथी भोजपुरी के बहुत प्रचलित लिपि रहल बिया. एकरा बारे में कवनो शंका संदेह बिहार-यूपी के जानकार लोगन में नइखे. एल. एस. एस. 
वो माले के लिखल पढ़ीं '), 72 | ('BULGARIAN', ' а дума попада в състояние на изпитание ключовите думи с предсказана малко под то изискване на страниците за търсене в'), 73 | ('CATALAN', 'al final en un únic lloc nhorabona l correu electrònic està concebut com a eina de productivitat aleshores per què perdre el temps arxivant missatges per després intentar recordar on els veu desar i per què heu d eliminar missatges importants per l'), 74 | ('CEBUANO', 'Ang Sugbo usa sa mga labing ugmad nga lalawigan sa nasod. Kini ang sentro sa komersyo, edukasyon ug industriya sa sentral ug habagatang dapit sa kapupod-an. Ang mipadayag sa Sugbo isip ikapito nga labing nindot nga pulo sa , ang nag-inusarang pulo sa Pilipinas nga napasidunggan sa maong magasin sukad pa sa tuig'), 75 | ('CROATIAN', 'Posljednja dva vladara su Kijaksar (Κυαξαρης; 625-585 prije Krista), fraortov sin koji će proširiti teritorij Medije i Astijag. Kijaksar je imao kćer ili unuku koja se zvala Amitis a postala je ženom Nabukodonosora II. kojoj je ovaj izgradio Viseće vrtove Babilona. Kijaksar je modernizirao svoju vojsku i uništio Ninivu 612. prije Krista. Naslijedio ga je njegov sin, posljednji medijski kralj, Astijag, kojega je detronizirao (srušio sa vlasti) njegov unuk Kir Veliki. 
Zemljom su zavladali Perzijanci.'), 76 | ('CZECH', ' a akci opakujte film uložen vykreslit gmail tokio smazat obsah adresáře nelze načíst systémový profil jednotky smoot okud používáte pro určení polokoule značky z západ nebo v východ používejte nezáporné hodnoty zeměpisné délky nelze'), 77 | ('DANISH', ' a z tallene og punktummer der er tilladte log ud angiv den ønskede adgangskode igen november gem personlige oplysninger kontrolspørgsmål det sidste tegn i dit brugernavn skal være et bogstav a z eller tal skriv de tegn du kan se i billedet nedenfor'), 78 | ('DUTCH', ' a als volgt te werk om een configuratiebestand te maken sitemap gen py ebruik filters om de s op te geven die moeten worden toegevoegd of uitgesloten op basis van de opmaaktaal elke sitemap mag alleen de s bevatten voor een bepaalde opmaaktaal dit'), 79 | ('ENGLISH', ' a backup credit card by visiting your billing preferences page or visit the adwords help centre for more details https adwords google com support bin answer py answer hl en we were unable to process the payment of for your outstanding google adwords'), 80 | ('ESTONIAN', ' a niipea kui sinu maksimaalne igakuine krediidi limiit on meie poolt heaks kiidetud on sinu kohustuseks see krediidilimiit'), 81 | ('FINNISH', ' a joilla olet käynyt tämä kerro meille kuka ä olet ei tunnistettavia käyttötietoja kuten virheraportteja käytetään google desktopin parantamiseen etsi näyttää mukautettuja uutisia google desktop keskivaihto leikkaa voit kaksoisnapsauttaa'), 82 | ('FRENCH', ' a accès aux collections et aux frontaux qui lui ont été attribués il peut consulter et modifier ses collections et exporter des configurations de collection toutefois il ne peut pas créer ni supprimer des collections enfin il a accès aux fonctions'), 83 | ('GALICIAN', ' debe ser como mínimo taranto tendas de venda polo miúdo cociñas servizos bordado canadá viaxes parques de vehículos de recreo hotel oriental habitación recibir unha postal no enderezo indicado 
anteriormente'), 84 | ('GANDA', ' abaana ba bani lukaaga mu ana mu babiri abaana ba bebayi lukaaga mu abiri mu basatu abaana ba azugaadi lukumi mu ebikumi bibiri mu abiri mu babiri abaana ba adonikamu lukaaga mu nltaaga mu mukaaga abaana ba biguvaayi enkumi bbiri mu ataano mu mukaaga'), 85 | ('GERMAN', ' abschnitt ordner aktivieren werden die ordnereinstellungen im farbabschnitt deaktiviert öchten sie wirklich fortfahren eldtypen angeben optional n diesem schritt geben sie für jedesfeld aus dem datenset den typ an ieser schritt ist optional eldtypen'), 86 | ('HAITIAN_CREOLE', ' ak pitit tout sosyete a chita se pou sa leta dwe pwoteje yo nimewo leta fèt pou li pwoteje tout paran ak pitit nan peyi a menm jan kit paran yo marye kit yo pa marye tout manman ki fè pitit leta fèt pou ba yo konkoul menm jan tou pou timoun piti ak pou'), 87 | ('HEBREW', ' או לערוך את העדפות ההפצה אנא עקוב אחרי השלבים הבאים כנס לחשבון האישי שלך ב'), 88 | ('HINDI', ' ं ऐडवर्ड्स विज्ञापनों के अनुभव पर आधारित हैं और इनकी मदद से आपको अपने विज्ञापनों का अधिकतम लाभ'), 89 | ('HMONG', ' Kuv hlub koj txawm lub ntuj yuav si ntshi nphaus los kuv tsis ua siab nkaug txawm ntiab teb yuav si ntshi nphaus los kuv tseem ua lon tsaug vim kuv hlub koj tag lub siab'), 90 | ('HUNGARIAN', ' a felhasználóim a google azonosító szöveget ikor látják a felhasználóim a google azonosító szöveget felhasználók a google azonosító szöveget fogják látni minden tranzakció után ha a vásárlását regisztrációját oldalunk'), 91 | ('ICELANDIC', ' a afköst leitarorða þinna leitarorð neikvæð leitarorð auglýsingahópa byggja upp aðallista yfir ný leitarorð fyrir auglýsingahópana og skoða ítarleg gögn um árangur leitarorða eins og samkeppni auglýsenda og leitarmagn er krafist notkun'), 92 | ('INDONESIAN', 'berdiri setelah pengurusnya yang berusia 83 tahun, Fayzrahman Satarov, mendeklarasikan diri sebagai nabi dan rumahnya sebagai negara Islam Satarov digambarkan sebagai mantan ulama Islam tahun 1970-an. 
Pengikutnya didorong membaca manuskripnya dan kebanyakan dilarang meninggalkan tempat persembunyian bawah tanah di dasar gedung delapan lantai mereka. Jaksa membuka penyelidikan kasus kriminal pada kelompok itu dan menyatakan akan membubarkan kelompok kalau tetap melakukan kegiatan ilegal seperti mencegah anggotanya mencari bantuan medis atau pendidikan. Sampai sekarang pihak berwajib belum melakukan penangkapan meskipun polisi mencurigai adanya tindak kekerasan pada anak. Pengadilan selanjutnya akan memutuskan apakah anak-anak diizinkan tetap tinggal dengan orang tua mereka. Kazan yang berada sekitar 800 kilometer di timur Moskow merupakan wilayah Tatarstan yang'), 93 | ('IRISH', ' a bhfuil na focail go léir i do cheist le fáil orthu ní gá ach focail breise a chur leis na cinn a cuardaíodh cheana chun an cuardach a bheachtú nó a chúngú má chuirtear focal breise isteach aimseofar fo aicme ar leith de na torthaí a fuarthas'), 94 | ('ITALIAN', ' a causa di un intervento di manutenzione del sistema fino alle ore circa ora legale costa del pacifico del novembre le campagne esistenti continueranno a essere pubblicate come di consueto anche durante questo breve periodo di inattività ci scusiamo per'), 95 | ('JAVANESE', ' account ten server niki kalian username meniko tanpo judul cacahe account nggonanmu wes pol pesen mu wes diguwak pesenan mu wes di simpen sante wae pesenan mu wes ke kirim mbuh tekan ora pesenan e ke kethok pesenan mu wes ke kirim mbuh tekan ora pesenan'), 96 | ('KINYARWANDA', ' dore ibyo ukeneye kumenya ukwo watubona ibibazo byinshi abandi babaza ububonero byibibina google onjela ho izina dyikyibina kyawe onjela ho yawe mulugo kulaho ibyandiko byawe shyilaho tegula yawe tulubaka tukongeraho iyanya mishya buliko tulambula'), 97 | ('LATVIAN', ' a gadskārtējā izpārdošana slēpošana jāņi atlaide izmaiņas trafikā kas saistītas ar sezonas izpārdošanu speciālajām atlaidēm u c ir parastas un atslēgvārdi kas ir populāri noteiktos laika posmos šajā laikā saņems 
lielāku klikšķu'), 98 | ('LITHUANIAN', ' a išsijungia mano idėja dėl geriausio laiko po pastarųjų savo santykių pasimokiau penki dalykai be kurių negaliu gyventi mano miegamajame tu surasi ideali pora išsilavinimas aukštoji mokykla koledžas universitetas pagrindinis laipsnis metai'), 99 | ('MACEDONIAN', ' гласовите коалицијата на вмро дпмне како партија со најмногу освоени гласови ќе добие евра а на сметката на коализијата за македонија'), 100 | ('MALAY', 'pengampunan beramai-ramai supaya mereka pulang ke rumah masing-masing. Orang-orang besarnya enggan mengiktiraf sultan yang dilantik oleh Belanda sebagai Yang DiPertuan Selangor. Orang ramai pula tidak mahu menjalankan perniagaan bijih timah dengan Belanda, selagi raja yang berhak tidak ditabalkan. Perdagang yang lain dibekukan terus kerana untuk membalas jasa beliau yang membantu Belanda menentang Riau, Johor dan Selangor. Di antara tiga orang Sultan juga dipandang oleh rakyat sebagai seorang sultan yang paling gigih. 1 | 2 SULTAN Sebagai ganti Sultan Ibrahim ditabalkan Raja Muhammad iaitu Raja Muda. Walaupun baginda bukan anak isteri pertama bergelar Sultan Muhammad bersemayam di Kuala Selangor juga. Pentadbiran baginda yang lemah itu menyebabkan Kuala Selangor menjadi sarang ioleh Cina di Lukut tidak diambil tindakan, sedangkan baginda sendiri banyak berhutang kepada 1'), 101 | ('MALTESE', ' ata ikteb messaġġ lil indirizzi differenti billi tagħżilhom u tagħfas il buttuna ikteb żid numri tfittxijja tal kotba mur print home kotba minn pagni ghal pagna minn ghall ktieb ta aċċessa stieden habib iehor grazzi it tim tal gruppi google'), 102 | ('MARATHI', 'हैदराबाद उच्चार ऐका (सहाय्य·माहिती)तेलुगू: హైదరాబాదు , उर्दू: حیدر آباد हे भारतातील आंध्र प्रदेश राज्याच्या राजधानीचे शहर आहे. हैदराबादची लोकसंख्या ७७ लाख ४० हजार ३३४ आहे. मोत्यांचे शहर अशी एकेकाळी ओळख असलेल्या या शहराला ऐतिहासिक, सांस्कृतिक आणि स्थापत्यशास्त्रीय वारसा लाभला आहे. 
१९९० नंतर शिक्षण आणि माहिती तंत्रज्ञान त्याचप्रमाणे औषधनिर्मिती आणि जैवतंत्रज्ञान क्षेत्रातील उद्योगधंद्यांची वाढ शहरात झाली. दक्षिण मध्य भारतातील पर्यटन आणि तेलुगू चित्रपटनिर्मितीचे हैदराबाद हे केंद्र आहे'), 103 | ('NEPALI', 'अरू ठाऊँबाटपनि खुलेको छ यो खाता अर अरू ठाऊँबाटपनि खुलेको छ यो खाता अर ू'), 104 | ('NORWEGIAN', ' a er obligatorisk tidsforskyvning plassering av katalogsøk planinformasjon loggfilbane gruppenavn kontoinformasjon passord domene gruppeinformasjon alle kampanjesporing alternativ bruker grupper oppgaveplanlegger oppgavehistorikk kontosammendrag antall'), 105 | ('PERSIAN', ' آب خوردن عجله می کردند به جای باز ی کتک کاری می کردند و همه چيز مثل قبل بود فقط من ماندم و يک دنيا حرف و انتظار تا عاقبت رسيد احضاريه ی ای با'), 106 | ('POLISH', ' a australii będzie widział inne reklamy niż użytkownik z kanady kierowanie geograficzne sprawia że reklamy są lepiej dopasowane do użytkownika twojej strony oznacza to także że możesz nie zobaczyć wszystkich reklam które są wyświetlane na'), 107 | ('PORTUGUESE', ' a abit prevê que a entrada desses produtos estrangeiros no mercado têxtil e vestuário do brasil possa reduzir os preços em cerca de a partir de má notícia para os empresários que terão que lutar para garantir suas margens de lucro mas boa notícia'), 108 | ('ROMANIAN', ' a anunţurilor reţineţi nu plătiţi pentru clicuri sau impresii ci numai atunci când pe site ul dvs survine o acţiune dorită site urile negative nu pot avea uri de destinaţie daţi instrucţiuni societăţii dvs bancare sau constructoare să'), 109 | ('ROMANIAN', 'оперативэ а органелор ши институциилор екзекутиве ши а органелор жудичиаре але путерий де стат фиекэруй орган ал путерий де стат и се'), 110 | ('RUSSIAN', ' а неправильный формат идентификатора дн назад'), 111 | ('SCOTS_GAELIC', ' air son is gum bi casg air a h uile briosgaid no gum faigh thu brath nuair a tha briosgaid a tighinn gad rannsachadh ghoogle gu ceart mura bheil briosgaidean ceadaichte cuiridh google briosgaid dha do neach 
cleachdaidh fa leth tha google a cleachdadh'), 112 | ('SERBIAN', 'балчак балчак на мапи србије уреди демографија у насељу балчак живи пунолетна становника а просечна старост становништва износи година'), 113 | ('SERBIAN', 'Društvo | četvrtak 1.08.2013 | 13:43 Krade se i izvorska voda Izvor: Gornji Milanovac -- U gružanskom selu Belo Polje prošle noći ukradeno je više od 10.000 litara kojima je obijen bazen. Bazen je bio zaključan i propisno obezbeđen.'), 114 | ('SLOVAK', ' a aktivovať reklamnú kampaň ak chcete kampaň pred spustením ešte prispôsobiť uložte ju ako šablónu a pokračujte v úprave vyberte si jednu z možností nižšie a kliknite na tlačidlo uložiť kampaň nastavenia kampane môžete ľubovoľne'), 115 | ('SLOVENIAN', ' adsense stanje prijave za google adsense google adsense račun je bil začasno zamrznjen pozdravljeni hvala za vaše zanimanje v google adsense po pregledu vaše prijavnice so naši strokovnjaki ugotovili da spletna stran ki je trenutno povezana z vašim'), 116 | ('SPANISH', ' a continuación haz clic en el botón obtener ruta también puedes desplazarte hasta el final de la página para cambiar tus opciones de búsqueda gráfico y detalles ésta es una lista de los vídeos que te recomendamos nuestras recomendaciones se basan'), 117 | ('SWAHILI', ' a ujumbe mpya jumla unda tafuta na angalia vikundi vya kujadiliana na kushiriki mawazo iliyopangwa kwa tarehe watumiaji wapya futa orodha hizi lugha hoja vishikanisho vilivyo dhaminiwa ujumbe sanaa na tamasha toka udhibitisho wa neno kwa haraka fikia'), 118 | ('SWEDISH', ' a bort objekt från google desktop post äldst meny öretag dress etaljer alternativ för vad är inne yaste google skrivbord plugin program för nyheter google visa nyheter som är anpassade efter de artiklar som du läser om du till exempel läser'), 119 | ('TAGALOG', ' a na ugma sa google ay nakaka bantog sa gitna nang kliks na nangyayari sa pamamagitan nang ordinaryong paggagamit at sa kliks na likha nang pandaraya o hindi tunay na paggamit bunga nito 
nasasala namin ang mga kliks na hindi kailangan o hindi gusto nang'), 120 | ('TURKISH', ' a ayarlarınızı görmeniz ve yönetmeniz içindir eğer kampanyanız için günlük bütçenizi gözden geçirebileceğiniz yeri arıyorsanız kampanya yönetimi ne gidin kampanyanızı seçin ve kampanya ayarlarını düzenle yi tıklayın sunumu'), 121 | ('UKRAINIAN', ' а більший бюджет щоб забезпечити собі максимум прибутків від переходів відстежуйте свої об яви за датою географічним розташуванням'), 122 | ('URDU', ' آپ کو کم سے کم ممکنہ رقم چارج کرتا ہے اس کی مثال کے طور پر فرض کریں اگر آپ کی زیادہ سے زیادہ قیمت فی کلِک امریکی ڈالر اور کلِک کرنے کی شرح ہو تو'), 123 | ('VIETNAMESE', ' adsense cho nội dung nhà cung cấp dịch vụ di động xác minh tín dụng thay đổi nhãn kg các ô xem chi phí cho từ chối các đơn đặt hàng dạng cấp dữ liệu ác minh trang web của bạn để xem'), 124 | ('WELSH', ' a chofrestru eich cyfrif ymwelwch a unwaith i chi greu eich cyfrif mi fydd yn cael ei hysbysu o ch cyfeiriad ebost newydd fel eich bod yn gallu cadw mewn cysylltiad drwy gmail os nad ydych chi wedi clywed yn barod am gmail mae n gwasanaeth gwebost'), 125 | ('YIDDISH', 'און פאנטאזיע ער איז באקאנט צים מערסטן פאר זיינע באַלאַדעס ער האָט געוווינט אין ווארשע יעס פאריס ליווערפול און לאנדאן סוף כל סוף איז ער'), 126 | ('SOMALI', ' a oo maanta bogga koobaad ugu qoran yahey beesha caalamka laakiin si kata oo beesha caalamku ula guntato soomaaliya waxa aan shaki ku jirin in aakhirataanka dadka soomaalida oo kaliya ay yihiin ku soomaaliya ka saari kara dhibka ay ku jirto'), 127 | ('IGBO', 'Chineke bụ aha ọzọ ndï omenala Igbo kpọro Chukwu. Mgbe ndị bekee bịara, ha mee ya nke ndi Christian. N\'echiche ndi ekpere chi Omenala Ndi Igbo, Christianity, Judaism, ma Islam, Chineke nwere ọtụtụ utu aha, ma nwee nanị otu aha. Ụzọ abụọ e si akpọ aha ahụ bụ Jehovah ma Ọ bụ Yahweh. Na ọtụtụ Akwụkwọ Nsọ, e wepụla aha Chineke ma jiri utu aha bụ Onyenwe Anyị ma ọ bụ Chineke dochie ya. 
Ma mgbe e dere akwụkwọ nsọ, aha ahụ bụ Jehova pụtara n’ime ya, ihe dị ka ugboro pụkụ asaa(7,000).'), 128 | ('HAUSA', ' a cikin a kan sakamako daga sakwannin a kan sakamako daga sakwannin daga ranar zuwa a kan sakamako daga guda daga ranar zuwa a kan sakamako daga shafukan daga ranar zuwa a kan sakamako daga guda a cikin last hour a kan sakamako daga guda daga kafar'), 129 | ('YORUBA', ' abinibi han ikawe alantakun le ni opolopo ede abinibi ti a to lesese bi eniyan to fe lo se fe lati se atunse jowo mo pe awon oju iwe itakunagbaye miran ti ako ni oniruru ede abinibi le faragba nipa atunse ninu se iwadi blogs ni ori itakun agbaye ti e ba'), 130 | ('ZULU', ' ana engu uma inkinga iqhubeka siza ubike kwi isexwayiso ngenxa yephutha lomlekeleli sikwazi ukubuyisela emuva kuphela imiphumela engaqediwe ukuthola imiphumela eqediwe zama ukulayisha kabusha leli khasi emizuzwini engu uma inkinga iqhubeka siza uthumele'), 131 | ('BOSNIAN', 'Novi predsjednik Mešihata Islamske zajednice u Srbiji (IZuS) i muftija dr. Mevlud ef. Dudić izjavio je u intervjuu za Anadolu Agency (AA) kako je uvjeren da će doći do vraćanja jedinstva među muslimanima i unutar Islamske zajednice na prostoru Sandžaka, te da je njegova ruka pružena za povratak svih u okrilje Islamske zajednice u Srbiji nakon skoro sedam godina podjela u tom dijelu Srbije. Dudić je za predsjednika Mešihata IZ u Srbiji izabran 4. januara, a zvanična inauguracija će biti obavljena u prvoj polovini februara. Kako se očekuje, prisustvovat će joj i reisu-l-ulema Islamske zajednice u Srbiji Husein ef. Kavazović koji će i zvanično promovirati Dudića u novog prvog čovjeka IZ u Srbiji. Dudić će danas boraviti u prvoj zvaničnoj posjeti reisu Kavazoviću, što je njegov privi simbolični potez nakon imenovanja. 
'), 132 | ('INDONESIAN', 'sukiyaki wikipedia indonesia ensiklopedia bebas berbahasa bebas berbahasa indonesia langsung ke navigasi cari untuk pengertian lain dari sukiyaki lihat sukiyaki irisan tipis daging sapi sayur sayuran dan tahu di dalam panci besi yang dimasak di atas meja makan dengan cara direbus sukiyaki dimakan dengan mence'), 133 | ('MALAY', 'sukiyaki wikipedia bahasa melayu ensiklopedia bebas sukiyaki dari wikipedia bahasa melayu ensiklopedia bebas lompat ke navigasi gelintar sukiyaki sukiyaki hirisan tipis daging lembu sayur sayuran dan tauhu di dalam periuk besi yang dimasak di atas meja makan dengan cara rebusan sukiyaki dimakan dengan mence'), 134 | ('FRENCH', fr_en_Latn), 135 | 136 | # This is just the "version marker": 137 | ('AZERBAIJANI', 'qpdbmrmxyzptlkuuddlrlrbas las les qpdbmrmxyzptlkuuddlrlrbas el la qpdbmrmxyzptlkuuddlrlrbas'), 138 | ) 139 | 140 | fullTestData = testData[:-1] + ( 141 | ('MONGOLIAN', 'ᠦᠭᠡ ᠵᠢᠨ ᠴᠢᠨᠭ᠎ᠠ ᠬᠦᠨᠳᠡᠢ ᠵᠢ ᠢᠯᠭᠠᠬᠣ'), 142 | ('X_Buginese', 'ᨄᨛᨑᨊᨒ ᨑᨗ ᨔᨒᨗᨓᨛ ᨕᨗᨋᨗᨔᨗ ᨒᨛᨄ ᨑᨛᨔᨛᨆᨗᨊ'), 143 | ('X_Gothic', '𐌰 𐌰𐌱𐍂𐌰𐌷𐌰𐌼 𐌰𐌲𐌲𐌹𐌻𐌹𐍃𐌺𐍃 𐌸𐌹𐌿𐌳𐌹𐍃𐌺𐍃 𐍆𐍂𐌰𐌲𐌺𐌹𐍃𐌺𐍃'), 144 | ('ABKHAZIAN', ' а зуа абзиара дақәшәоит ан лыбзиабара ахә амаӡам ауаҩы игәы иҭоу ихы иҿы ианубаалоит аҧҳәыс ҧшӡа ахацәа лышьҭоуп аҿаасҭа лара дрышьҭоуп'), 145 | ('AFAR', ' nagay tanito nagay tanto nagayna naharsi nahrur nake nala nammay nammay haytu nanu narig ne ni num numu o obare obe obe obisse oggole ogli olloyta ongorowe orbise othoga r rabe rade ra e rage rakub rasitte rasu reyta rog ruddi ruga s sa al bada sa ala'), 146 | ('AKAN', 'Wɔwoo Hilla Limann Mumu-Ɔpɛnimba 12 afe 1934. Wɔwoo no wɔ Gwollu wɔ Sisala Mantaw mu Nna ne maame yɛ Mma Hayawah. Ne papa so nna ɔyɛ Babini Yomu. Ɔwarr Fulera Limann ? Ne mba yɛ esuon-- Lariba Montia [wɔwoo no Limann]; Baba Limann; Sibi Andan [wɔwoo no Limann]; Lida Limann; Danni Limann; Zilla Limann na Salma Limann. 
Ɔtenaa ase kɔpemm Sanda-Kwakwa da ɛtɔ so 23 wɔ afe 1998 wɔ ?.'), 147 | ('AMHARIC', ' ለመጠይቅ ወደ እስክንድርያ ላኩዋቸውና የእስክንድርያ ጳጳስ አቴናስዮስ ፍሬምንጦስን እራሳቸውን ሾመው ልከዋል ከዚያ እስከ ዓ ም ድረስ የኢትዮጵያ አቡነ'), 148 | ('ASSAMESE', 'অঞ্চল নতুন সদস্যবৃন্দ সকলোৱে ভৰ্তি হব পাৰে মুল পৃষ্ঠা জন লেখক গুগ ল দল সাৰাংশ ই পত্ৰ টা বাৰ্তা এজন'), 149 | ('AYMARA', ' aru wijar aru ispañula ukaran aru witanam aru kurti aru kalis aru warani aru malta aru yatiyawi niya jakitanaka isluwiñ aru lmir phuran aru masirunan aru purtukal aru kruwat aru jakira urtu aru inklisa pirsan aru suyku aru malay aru jisk aptayma thaya'), 150 | ('BASHKIR', ' арналђан бындай ђилми эш тіркињлњ тњјге тапєыр нњшер ителњ ғинуар бєхет именлектє етешлектє ауыл ўќмерџєре хеџмєт юлын ћайлаѓанда'), 151 | ('BISLAMA', ' king wantaem nomo hem i sakem setan mo ol rabis enjel blong hem oli aot long heven oli kamdaon long wol taswe ol samting oli kam nogud olgeta long wol ya stat long revelesen ol faet kakae i sot ol sik mo fasin blong brekem loa oli kam antap olgeta samting'), 152 | ('BRETON', ' a chom met leuskel a ra e blas da jack irons dilabour hag aet kuit eus what is this dibab a reont da c houde michael beinhorn evit produiñ an trede pladenn kavet e vez ar ganaouennoù buhan ha buhan ganto setu stummet ar bladenn adkavet e vez enni funk'), 153 | ('BURMESE', ' တက္ကသုိလ္ မ္ဟ ပ္ရန္ လာ္ရပီးေနာက္ န္ဟစ္ အရ္ဝယ္ ဦးသန္ ့သည္ ပန္ းတနော္ အမ္ယုိးသား ေက္ယာင္ း'), 154 | ('CORSICAN', ' a prupusitu di risultati for utilizà a scatula per ricercà ind issi risultati servore errore u servore ha incuntratu una errore pruvisoria é ùn ha pussutu compie a vostra dumanda per piacè acimenta dinò ind una minuta tuttu listessu ligami truvà i'), 155 | ('DZONGKHA', ' རྩིས བརྐྱབ ཚུལ ལྡན དང ངེས བདེན སྦ སྟོན ནིའི དོན ལུ ཁྱོད གུག ཤད ལག ལེན འཐབ དགོ ག དང ཨིན པུཊི གྲལ ཐིག གུ'), 156 | ('ESPERANTO', ' a jarcento refoje per enmetado de koncerna pastro tiam de reformita konfesio ekde refoje ekzistis luteranaj komunumanoj tamen tiuj fondis propran komunumon nur en ambaŭ apartenis 
ekde al la evangela eklezio en prusio resp ties rejnlanda provinceklezio en'), 157 | ('FAROESE', ' at verða átaluverdar óhóskandi ella áloypandi vit kunnu ikki garanterða at google leitanin ikki finnur naka sum er áloypandi óhóskandi ella átaluvert og google tekur onga ábyrgd yvir tær síður sum koma við í okkara leitiskipan fá tær ein'), 158 | ('FIJIAN', ' i kina na i iri ka duatani na matana main a meke wesi se meke mada na meke ni yaqona oqo na meke ka dau vakayagataki ena yaqona vakaturaga e dau caka toka ga kina na vucu ka dau lagati tiko kina na ka e yaco tiko na talo ni wai ni yaqona na lewai ni wai'), 159 | ('FRISIAN', ' adfertinsjes gewoan lytse adfertinsjes mei besibbe siden dy t fan belang binne foar de ynhâld fan jo berjochten wolle jo mear witte fan gmail foardat jo jo oanmelde gean dan nei wy wurkje eltse dei om gmail te ferbetterjen dêrta sille wy jo sa út en'), 160 | ('GREENLANDIC', ' at nittartakkalli uani toqqarsimasatta akornanni nittartakkanut allanut ingerlaqqittoqarsinnaavoq kanukoka tassaavoq kommuneqarfiit kattuffiat nuna tamakkerlugu kommunit nittartagaannut ingerlaqqiffiusinnaasoq kisitsiserpassuit nunatsinnut tunngasut'), 161 | ('GUARANI', ' aháta añe ë ne mbo ehára ndive ajeruréta chupe oporandujey haĝua peëme mba épa pekaru ha áĝa oporandúvo nde eréta avei re paraguaýpe kachíke he i leúpe ndépa re úma kure tatakuápe ha leu ombohovái héë ha ujepéma kachíke he ijey'), 162 | ('HAWAIIAN', 'He puke noiʻi kūʻikena kūnoa ʻo Wikipikia. E ʻoluʻolu nō, e hāʻawi mai i kāu ʻike, kāu manaʻo, a me kou leo no ke kūkulu ʻana a me ke kākoʻo ʻana mai i ka Wikipikia Hawaiʻi. He kahua pūnaewele Hawaiʻi kēia no ka hoʻoulu ʻana i ka ʻike Hawaiʻi. Inā hiki iā ʻoe ke ʻōlelo Hawaiʻi, e ʻoluʻolu nō, e kōkua mai a e hoʻololi i nā ʻatikala ma ʻaneʻi, a pono e haʻi aku i kou mau hoa aloha e pili ana i ka Wikipikia Hawaiʻi. E ola mau nō ka ʻōlelo Hawaiʻi a mau loa aku.'), 163 | ('IGBO', 'Chineke bụ aha ọzọ ndï omenala Igbo kpọro Chukwu. 
Mgbe ndị bekee bịara, ha mee ya nke ndi Christian. N\'echiche ndi ekpere chi Omenala Ndi Igbo, Christianity, Judaism, ma Islam, Chineke nwere ọtụtụ utu aha, ma nwee nanị otu aha. Ụzọ abụọ e si akpọ aha ahụ bụ Jehovah ma Ọ bụ Yahweh. Na ọtụtụ Akwụkwọ Nsọ, e wepụla aha Chineke ma jiri utu aha bụ Onyenwe Anyị ma ọ bụ Chineke dochie ya. Ma mgbe e dere akwụkwọ nsọ, aha ahụ bụ Jehova pụtara n’ime ya, ihe dị ka ugboro pụkụ asaa(7,000).'), 164 | ('INTERLINGUA', ' super le sitos que tu visita isto es necessari pro render disponibile alcun functionalitates del barra de utensiles a fin que nos pote monstrar informationes ulterior super un sito le barra de utensiles debe dicer a nos le'), 165 | ('INTERLINGUE', ' abhorre exceptiones in li derivation plu cardinal por un l i es li regularità del flexion conjugation ples comparar latino sine flexione e li antiqui projectes naturalistic queles have quasi null regules de derivation ma si on nu examina li enunciationes'), 166 | ('INUPIAK', 'sabvaqjuktuq sabvaba atiqaqpa atiqaqpa ibiq iebiq ixafich niuqtulgiññatif uvani natural gas tatpikka ufasiksigiruaq maaffa savaannafarufa mi tatkivani navy qanuqjugugguuq taaptuma inna uqsrunik ivaqjiqhutik taktuk allualiuqtuq sigukun nanuq puuvraatuq taktuum amugaa kalumnitigun nanuq agliruq allualiuqtuq'), 167 | ('KASHMIRI', ' ژماں سرابن منز گرٲن چھِہ خابٕک کھلونہٕ ؤڈراواں تُلتِھ نِیَس تہٕ گوشہِ گوشہِ مندچھاوى۪س دِلس چھُہ وون٘ت وُچھان از ستم قلم صبوٝرٕ وول مسٲفر لیۆکھُن بێتابن منز ورل سوال چھُہ تراواں جوابن منز کالہٕ پھۯستہٕ پھن٘ب پگَہہ پہ پۆت نظر دِژ نہٕ ژھالہٕ مٔت آرن مٲنز مسول متھان چھےٚ مس والن وۅن چھےٚ غارن تہِ نارٕ ژھٹھ ژاپان رێش تۅرگ تراوٕہن تہٕ ون رٹہٕ ہن ہوشہِ ہێۆچھ نہٕ پوشنوٝلس نِش مۅہرٕ دی دی زٕلاں چھِ زى۪و حرفن لۆدرٕ پھٔل ہى۪تھ ملر عازمؔ سۆدرٕ کھۅنہِ منز منگاں چھُہ ندرى۪ن پن ژے تھى۪کی یہِ مسٲفر پنن وُڈو تہٕ پڑاو گٕتَو گٕتَو چھےٚ یہِ کۅل بُتھ تہٕ بانہٕ سٕہہ گۅردٕ چھہِ سپداں دمہٕ پُھٹ چھِٹہ پونپر پکھہٕ داران سُہ یتى۪ن تۯاوِ کم نظر دۯاکھ تہٕ باسیوے سُہ مۆہ ہیو یێران مےٚ ژى۪تُرمُت چھُہ 
سُلی تس چھےٚ کتى۪ن تھپھ شاد مس کراں وُچھ مےٚ خون ژٕ خبر کیازِ کراں دۯاکھ تمِس پى۪ٹھ ماتم أز کہِ شبہٕ آو مےٚ بێیہِ پیش سفر زانہِ خدا دارِ پى۪ٹھ ژٲنگ ہنا تھو زِ ژے چھےٚ مێون أنہٕ کپٹاں چھُہ زٕژن سون مظفّر عازمؔ پوشہ برگن چھُہ سُواں چاکھ سُہ الماس قلم لوِ کٔ ڈ نوِ سرٕ سونتس کل پروِ بۆر بێیہ از بانبرِ ہۆت یمبرزلہِ ٹارى۪ن منز نار وزملہِ کۅسہٕ کتھ کٔر اظہار کچھہِ منزٕ ؤن رووُم اچھہِ چشمو ژوپُم کٔنڈ انبار تماشہِ چھہِ تگاں'), 168 | ('KAZAKH', ' ﺎ ﻗﻴﺎﻧﺎﺕ ﺑﻮﻟﻤﺎﻳﺪﻯ ﺑﯘﻝ ﭘﺮﻭﺗﺴﻪﺳﯩﻦ ﻳﺎﻋﻨﻲ ﻗﺎﻻ ﻭﻣﯩﺮﯨﻨﺪﻩ ﻗﺎﺯﺍﻕ ء ﺗﯩﻠﯩﻨﯩﯔ ﻗﻮﻟﺪﺍﻧﯩﻠﻤﺎﯞﻯ ﻗﺎﺯﺍﻕ ﺟﻪﺭﯨﻨﺪﻩ'), 169 | ('KAZAKH', ' а билердің өзіне рұқсат берілмеген егер халық талап етсе ғана хан келісім берген өздеріңіз білесіздер қр қыл мыс тық кодексінде жазаның'), 170 | ('KHASI', ' kaba jem jai sa sngap thuh ia ki bynta ba sharum naka sohbuin jong phi nangta sa pynhiar ia ka kti kadiang jong phi sha ka krung jong phi bad da kaba pyndonkam kumjuh ia ki shympriahti jong phi sa sngap thuh shapoh ka tohtit jong phi pyndonkam ia kajuh ka'), 171 | ('KURDISH', ' بۆ به ڕێوه بردنی نامه ی که دێتن ڕاسته وخۆ ڕه وان بکه نامه کانی گ مایل بۆ حسابی پۆستێکی تر هێنانی په یوه ندکاره کان له'), 172 | ('KYRGYZ', ' جانا انى تانۇۇ ۇلۇتۇن تانۇۇ قىرعىزدى بئلۉۉ دەگەندىك اچىق ايتساق ماناستى تاانىعاندىق ۅزۉڭدۉ تاانىعاندىق بۉگۉن تەما جۉكتۅمۅ ق ى رع ى ز ت ى ل ى'), 173 | ('KYRGYZ', ' агай эле оболу мен садыбакас аганын өзү менен эмес эмгектери менен тааныштым жылдары ташкенде өзбекстан илимдер академиясынын баяны'), 174 | ('LATIN', ' a deo qui enim nocendi causa mentiri solet si iam consulendi causa mentiatur multum profecit sed aliud est quod per se ipsum laudabile proponitur aliud quod in deterioris comparatione praeponitur aliter enim gratulamur cum sanus est homo aliter cum melius'), 175 | ('LINGALA', ' abakisamaki ndenge esengeli moyebami abongisamaki solo mpenza kombo ya moyebami elonguamaki kombo ya bayebami elonguamaki kombo eleki molayi po na esika epesameli limbisa esika ya kotia ba kombo esuki boye esengeli olimbola ndako na yo ya mikanda kombo'), 176 | 
('LUXEMBOURGISH', ' a gewerkschaften och hei gefuerdert dir dammen an dir häre vun de gewerkschaften denkt un déi aarm wann der äer fuerderunge formuléiert d sechst congés woch an aarbechtszäitverkierzung hëllefen hinnen net d unhiewe vun de steigerungssäz bei de'), 177 | ('MALAGASY', ' amporisihin i ianao mba hijery ny dika teksta ranofotsiny an ity lahatsoratra ity tsy ilaina ny opérateur efa karohina daholo ny teny rehetra nosoratanao ampiasao anaovana dokambarotra i google telugu datin ny takelaka fikarohana sary renitakelak i'), 178 | ('MALAY', 'bilik sebelah berkata julai pada pm ladymariah hmm sume ni terpulang kepada individu mungkin anda bernasib baik selama ini dalam membeli hp yang bagus deli berkata julai pada pm walaupun bukan bahsa baku tp tetap bahasa melayu kan perubahan boleh dibuat'), 179 | ('MANX', ' and not ripe as i thought yn assyl yn shynnagh as yn lion the ass the fox and the lion va assyl as shynnagh ayns commee son nyn vendeilys as sauchys hie ad magh ayns y cheyll dy shelg cha row ad er gholl feer foddey tra veeit ad rish lion yn shynnagh'), 180 | ('MAORI', ' haere ki te kainga o o haere ki te kainga o o haere ki te kainga o te rapunga ahua o haere ki te kainga o ka tangohia he ki to rapunga kaore au mohio te tikanga whakatiki o te ra he whakaharuru te pai rapunga a te rapunga ahua a e kainga o nga awhina o te'), 181 | ('MAURITIAN_CREOLE', 'Anz dir mwa, Sa bann delo ki to trouve la, kot fam prostitie asize, samem bann pep, bann lafoul dimoun, bann nasion ek bann langaz. Sa dis korn ki to finn trouve, ansam avek bebet la, zot pou ena laenn pou prostitie la; zot pou pran tou seki li ena e met li touni, zot pou manz so laser e bril seki reste dan dife. Parski Bondie finn met dan zot leker proze pou realiz so plan. 
Zot pou met zot dakor pou sed zot pouvwar bebet la ziska ki parol Bondie fini realize.'), 182 | ('MONGOLIAN', ' а боловсронгуй болгох орон нутгийн ажил үйлсийг уялдуулж зохицуулах дүрэм журам боловсруулах орон нутгийн өмч хөрөнгө санхүүгийн'), 183 | ('NAURU', ' arcol obabakaen riringa itorere ibibokiei ababaro min kuduwa airumena baoin tokin rowiowet itiket keram damadamit eigirow etoreiy row keitsito boney ibingo itsiw dorerin naoerodelaporte s nauruan dictionary a c a c d g h o p s t y aiquen ion eins aiquen'), 184 | ('NDEBELE', "ikomiti elawulako yegatja emhlanganweni walo ]imithetho mgomo ye anc ibekwa malunga wayo begodu ubudosiphambili kugandelela lokho okutjhiwo yi lokha nayithi abantu ngibo "), 185 | ('NORWEGIAN_N', ' a for verktylina til å hjelpa deg å nå oss merk at pagerank syninga ikkje automatisk kjem til å henta inn informasjon frå sider med argument dvs frå sider med eit i en dersom datamaskina di er plassert bak ein mellomtenar for vevsider kan det verka'), 186 | ('NYANJA', 'Boma ndi gawo la dziko lomwe linapangidwa ndi cholinga chothandiza ntchito yolamulira. Kuŵalako kulikuunikabe mandita, Edipo nyima unalephera kugonjetsa kuŵalako.'), 187 | ('OCCITAN', ' Pasmens, la classificacion pus admesa uei (segon Juli Ronjat e Pèire Bèc) agropa lei parlars deis Aups dins l\'occitan vivaroaupenc e non dins lo dialècte provençau.'), 188 | ('OROMO', ' afaan katalaa bork bork bork hiikaa jira hin argamne gareen barbaadame hin argamne gargarsa qube en gar bayee jira garee walitti firooman gareewwan walitti firooman fuula web akka tartiiba qubeetiin agarsiisi akka tartiiba qubeetiin agarsiisaa jira akka'), 189 | ('PASHTO', ' اتو مستقل رياست جوړ شو او د پخواني ادبي انجمن څانګې ددې رياست جز شوی او ددې انجمن د ژبې مديريت د پښتو ټولنې په لوی مديريت واوښت لوی مدير يې د'), 190 | ('PEDI', 'Bophara bja Asia ekaba 8.6% bja lefase goba 29.4% bja naga ya lefase (ntle le mawatle). 
Asia enale badudu bao bakabago dimillione millione tše nne (4 billion) yeo e bago 60% ya badudi ba lefase ka bophara. A bapolelwa rena sefapanong mehleng ya Pontius Pilatus. A hlokofatšwa, A bolokwa, A tsoga ka letšatši la boraro, ka mo mangwalo a bolelago ka gona, a rotogela magodimong, '), 191 | ('QUECHUA', ' is t ipanakunatapis rikuchinankupaq qanpa simiykipi noqaykoqpa uya jllanakunamanta kunan jamoq simikunaman qelqan tiyan watukuy qpa uyata qanpa llaqtaykipi llank anakuna simimanta yanapakuna simimanta mayqen llaqtallapis kay simimanta t ijray qpa qelqa'), 192 | ('RHAETO_ROMANCE', ' Cur ch’il chantun Turitg ha dà il dretg da votar a las dunnas (1970) è ella vegnida elegida en il cussegl da vischnanca da Zumikon per la Partida liberaldemocratica svizra (PLD). Da 1974 enfin 1982 è ella stada presidenta da vischnanca da Zumikon. L’onn 1979 è Elisabeth Kopp vegnida elegida en il Cussegl naziunal e reelegida quatter onns pli tard cun in resultat da sur 100 000 vuschs. L’onn 1984 è ella daventada vicepresidenta da la PLD.'), 193 | ('RUNDI', ' ishaka mu ndero y abana bawe ganira n abigisha nimba hari ingorane izo ari zo zose ushobora gusaba kubonana n umwigisha canke kuvugana nawe kuri terefone inyuma y uko babarungikira urutonde rw amanota i muhira mu bisanzwe amashure aratumira abavyeyi'), 194 | ('SAMOAN', ' autu mea o lo totonu le e le minaomia matou te tuu i totonu i le faamatalaina o le suesuega i taimi uma mea o lo totonu fuafua i mea e tatau fa afoi tala mai le newsgroup mataupu fa afoi mai tala e ai le mataupu e ai totonu tusitala o le itu o faamatalaga'), 195 | ('SANGO', ' atâa na âkotta zo me lâkwê angbâ gï tarrango nî âkotta zo tî koddoro nî âde agbû tenne nî na kate töngana mbênî kotta kpalle tî nzönî dutï tî halëzo pëpe atâa sô âla lü gbâ tî ândya tî mâi na sahngo asâra gbâ tî'), 196 | ('SANSKRIT', ' ं क र्मणस् त स्य य त्कि ङ्चेह करो त्यय ं त स्माल् लोका त्पु नरै ति अस्मै लोका य क र्मण इ ति नु काम'), 197 | ('SANSKRIT', ' brahmā tatraivāntaradhīyata 
tataḥ saśiṣyo vālmīkir munir vismayam āyayau tasya śiṣyās tataḥ sarve jaguḥ ślokam imaṃ punaḥ muhur muhuḥ prīyamāṇāḥ prāhuś ca bhṛśavismitāḥ samākṣaraiś caturbhir yaḥ pādair gīto'), 198 | ('SCOTS', ' a gless an geordie runciman ower a gless an tamson their man preached a hale hoor aboot the glorious memories o forty three an backsliders an profane persons like esau an aboot jeroboam the son o nebat that gaed stravagin to anither kirk an made aa israel'), 199 | ('SESELWA', 'Sesel ou menm nou sel patri. Kot nou viv dan larmoni. Lazwa, lanmour ek lape. Nou remersye Bondye. Preserv labote nou pei. Larises nou losean. En leritaz byen presye. Pour boner nou zanfan. Reste touzour dan linite. Fer monte nou paviyon. Ansanm pou tou leternite. Koste Seselwa!'), 200 | ('SESOTHO', ' bang ba nang le thahasello matshwao a sehlooho thuto e thehilweng hodima diphetho ke tsela ya ho ruta le ho ithuta e totobatsang hantle seo baithuti ba lokelang ho se fihlella ntlhatheo eo e sebetsang ka yona ke ya hore titjhere o hlakisa pele seo'), 201 | ('SHONA', ' chete vanyori vanotevera vakabatsira kunyora zvikamu zvino kumba home tinyorere tsamba chikamu chakumbirwa hachina kuwanikwa chikamu ichi cheninge chakayiswa kuimwe nzvimbo mudhairekitori rino chimwe chikamu chopadhuze pane chinhu chatadza kushanda bad'), 202 | ('SINDHI', ' اضافو ٿي ٿيو پر اها خبر عثمان کي بعد پيئي ته سگريٽ ڇڪيندڙ مسلمان نه هو بلڪ هندو هو دڪان تي پهچي عثمان ڪسبت کولي گراهڪن جي سيرب لاهڻ شروع ڪئي پر'), 203 | ('SISWANT', ' bakhokhintsela yesikhashana bafake imininingwane ye akhawunti leliciniso kulelifomu nangabe akukafakwa imininingwane leliciniso imali lekhokhiwe angeke ifakwe kumkhokhintsela lofanele imininingwane ye akhawunti ime ngalendlela lelandzelako inombolo'), 204 | ('SUNDANESE', 'Nu ngatur kahirupan warga, keur kapentingan pamarentahan diatur ku RT, RW jeung Kepala Dusun, sedengkeun urusan adat dipupuhuan ku Kuncen jeung kepala adat. 
Sanajan Kampung Kuta teu pati anggang jeung lembur sejenna nu aya di wewengkon Desa Pasir Angin, tapi boh wangunan imah atawa tradisi kahirupan masarakatna nenggang ti nu lian.'), 205 | ('TAJIK', ' адолат ва инсондӯстиро бар фашизм нажодпарастӣ ва адоват тарҷеҳ додааст чоп кунед ба дигарон фиристед чоп кунед ба дигарон фиристед'), 206 | ('TATAR', 'ачарга да бирмәде чәт чәт килеп тора безнең абыйнымы олы абыйнымы эштән'), 207 | ('TATAR', ' alarnı eşkärtü proğramnarın eşläwen däwam itü tatar söylämen buldıru wä sizep alu sistemnarın eşläwen däwat itü häm başqalar yılnıñ mayında tatar internetı ictimağıy oyışması milli ts isemle berençe däräcäle häm tat'), 208 | ('TIBETAN', ' ་གྱིས་ཁ་ཆེའི་ཕྱག་འཚལ་ཁང་ཞིག་བཤིག་སྲིད་པ། ཡར་ཀླུང་གཙང་པོར་ཆ ུ་མཛོང་བརྒྱག་རྒྱུའི་ལས་འཆར་ལ་རྒྱ་གར་གྱི་སེམས་ཚབས། རྒྱ་གརགྱི་མཚོ་འོག་དམག་གྲུར་སྦར་གས་བྱུང་བ། པ་ཀི་སི་ཏན་གྱིས་རྒྱ་གར་ལ་མི་སེར་བསད་པའི་སྐྱོན་འཛུགས་བྱས་པ། རྩོམ་ཡིག་མང་བ། འབྲེལ་མཐུད་བརྒྱུད་ལམ། ཐོན་སྐྱེད་དང་སྲི་ཞུ། ་ཐོག་དེབ་བཞི་ དཔར་འགྲེམས་གནང་ཡོད་པ་དང་བོད་ཡིག་དྲ་ཚིགས་ཁག་ནང་ལ་ཡང་རྩོམ་ཡང་ཡང་བྲིས་གནང་མཁན་རེད། ལེ་ཚན་ཁག ལེ་ཚན་ཁག འབྲེལ་ཡོད། འགྲེམ་སྟོན། རྒྱུད་ལམ་སྣ་མང་ཡིག་མཛོད། བཀོལ་སྤྱོད་པའི་འཇོག་ཡུལ་དྲ་ངོས། སྔོན་མ། རྗེས་མ། བསྟན་འཛིན་བདེ་སྐྱིད། ཚེ་རིང་རྣམ་རྒྱལ། བསྟན་འཛིན་ངག་དབང་། ཡོལ་གདོང་ཚེ་རིང་ལྷག་པ། ་དབང་ ཕྱུག་གཉིས་ཀྱིས་བརྗོད་གཞི་བྱེ་བྲག་པ་ཞིག་ལ་བགྲོ་གླེང་གཏིང་ཟབ་བྱེད་པའི་གཟའ་ འཁོར་གཉིས་རེའི་མཚམས་ཀྱི་ལེ་ཚན་ཞིག་ཡིན། དཔྱད་ཞིབ་ཀྱིས་རྒྱ་ནག་ནང་ཁུལ་གྱི་འགྱུར་ལྡོག་དང༌། རྒྱ་ནག་དང་རྒྱལ་སྤྱིའི་འབྲེལ་བར་དམིགས་སུ་བཀར་ནས་བགྲོ་གླེང་བྱེད་ཀྱི་ཡོད།། རྒྱང་སྲིང་དུས་ཚོད།'), 209 | ('TIGRINYA', ' ሃገር ተረፎም ዘለዉ ኢትዮጵያውያን ኣብቲ ምስ ኢትዮጵያ ዝዳውብ ኣውራጃ ደቡብ ንኽነብሩ ኣይፍቀደሎምን እዩ ካብ ሃገር ንኽትወጽእ ዜጋ ኹን ወጻእተኛ ናይ'), 210 | ('TONGA', ' a ke kumi oku ikai ke ma u vakai ki hono hokohoko faka alafapeti api pe ko e uluaki peesi a ho o fekumi faka malatihi fekumi ki he lea oku fakaha atu pe ko ha fonua fekumi ki he fekumi ki he peesi oku ngaahi me a oku sai imisi alu ki he ki he ulu aki'), 211 | ('TSONGA', ' a ku na timhaka leti nga ta vulavuriwa na google google yi hlonipha yi tlhela yi sirheleta 
vanhu hinkwavo lava tirhisaka google toolbar ku dyondza hi vusireleli eka system ya hina hi kombela u hlaya vusireleli bya hina eka toolbar mbulavulo wu tshikiwile'), 212 | ('TSWANA', ' go etela batla ditsebe tsa web tse di nang le le batla ditsebe tse di golaganya le tswang mo leka go batla web yotlhe batla mo web yotlhe go bona home page ya google batla mo a o ne o batla gore a o ne o batla ditsebe tsa bihari batla mo re maswabi ga go'), 213 | ('TURKMEN', ' айдянларына ынанярмыка эхли боз мейданлары сурулип гутарылан тебигы ота гарып гумлукларда миллиондан да артыкмач ири шахлы малы миллиона'), 214 | ('TURKMEN', ' akyllylyk çyn söýgi üçin böwet däl de tebigylykdyr duýgularyň gödeňsiligi aç açanlygy bahyllygy söýgini betnyşanlyk derejesine düşürýändir söýeni söý söýmedige süýkenme özüni söýmeýändigini görmek ýigit üçin uly'), 215 | ('AKAN', ' amammui tumidifo no bɛtow ahyɛ atoro som so mpofirim na wɔasɛe no pasaa ma ayɛ nwonwa dɛn na ɛbɛka wɔn ma wɔayɛ saa bible no ma ho mmuae wɔ adiyisɛm nhoma no mu sɛ onyankopɔn na ɔde hyɛɛ wɔn komam sɛ wɔmma ne nsusuwii mmra mu'), 216 | ('UIGHUR', ' ئالەملەرنىڭ پەرۋەردىگارىدىن تىلەيمەن سىلەر بۇ يەرلەردە باغچىلاردىن بۇلاقلاردىن زىرائەتلەردىن يۇمشاق پىشقان خورمىلاردىن بەھرىمەن بولۇپ'), 217 | ('UIGHUR', ' а башлиди әмма бу қетимқи канада мәтбуатлириниң хәвәрлиридә илгирикидәк хитай һөкүмәт мәтбуатлиридин нәқил алидиған вә уни көчүрүп'), 218 | ('UZBEK', ' آرقلی بوتون سیاسی حزب و گروه لرفعالیتیگه رخصت بیرگن اخبارات واسطه لری شو ییل مدتیده مثال سیز ترقی تاپکن و اهالی نینگ اقتصادی وضعیتی اوتمیش'), 219 | ('UZBEK', ' а гапирадиган бўлсак бунинг иккита йўли бор биринчиси мана шу қуриган сатҳини қумликларни тўхтатиш учун экотизимни мустаҳкамлаш қумга'), 220 | ('UZBEK', ' abadiylashtirildi aqsh ayol prezidentga tayyormi markaziy osiyo afg onistonga qanday yordam berishi mumkin ukrainada o zbekistonlik muhojirlar tazyiqdan shikoyat qilmoqda gruziya va ukraina hozircha natoga qabul qilinmaydi afg oniston o zbekistonni g'), 221 | ('VENDA', 
'Vho ṱanganedzwa kha Wikipedia nga tshiVenḓa. Vhadivhi vha manwalo a TshiVenda vha talusa divhazwakale na vhubvo ha Vhavenda ngau fhambana. Vha tikedza mbuno dzavho uya nga mawanwa a thoduluso dze vha ita. Vhanwe vha vhatodulusi vhari Vhavenda vho tumbuka Afrika vhukati vha tshimbila vha tshiya Tshipembe ha Afrika, Rhodesia hune ha vho vhidzwa Zimbagwe namusi.'), 222 | ('VOLAPUK', ' brefik se volapükavol nüm balid äpubon ün dü lif lölik okas redakans älaipübons gasedi at nomöfiko äd ai mu kuratiko pläo timü koup nedäna fa ns deutän kü päproibon fa koupanef me gased at ästeifülom ad propagidön volapüki as sam ün'), 223 | ('WARAY_PHILIPPINES', 'Amo ini an balay han Winaray o Binisaya nga Lineyte-Samarnon nga Wikipedia, an libre ngan gawasnon nga ensayklopedya nga bisan hin-o puyde magliwat o mag-edit. An Wikipedia syahan gintikang ha Iningles nga yinaknan han tuig 2001. Ini nga bersyon Winaray gintikang han ika-25 han Septyembre 2005 ngan ha yana mayda 514,613 nga artikulo. Kon karuyag niyo magsari o magprobar, pakadto ha . An Gastrotheca pulchra[2] in uska species han Anura nga ginhulagway ni Ulisses Caramaschi ngan Rodrigues hadton 2007. 
An Gastrotheca pulchra in nahilalakip ha genus nga Gastrotheca, ngan familia nga Hemiphractidae.[3][4] Ginklasipika han IUCN an species komo kulang hin datos.[1] Waray hini subspecies nga nakalista.[3]'), 224 | ('WOLOF', ' am ak dëgg dëggam ak gëm aji bind ji te gëstu ko te jëfandikoo tegtalu xel ci saxal ko sokraat nag jëfandikoo woon na xeltu ngir tas jikko yu rafet ci biir nit ñi ak dëggu ak soppante sokraat nag ñëw na mook aflaton platon sukkandiku ci ñaari'), 225 | ('XHOSA', ' a naynga zonke futhi libhengezwa kwiwebsite yebond yasemzantsi afrika izinga elisebenzayo xa usenza olu tyalo mali liya kusebenza de liphele ixesha lotyalo mali lwakho inzala ihlawulwa rhoqo emva kweenyanga ezintandathu ngomhla wamashumi amathathu ananye'), 226 | ('X_KLINGON', ' a ghuv bid soh naq jih lodni yisov chich wo vamvo qeylis lunge pu chah povpu vodleh a dah ghah cho ej dah wo che pujwi bommu tlhegh darinmohlahchu pu majqa horey so lom qa ip quv law may vad suvtahbogh wa sanid utlh quv pus datu pu a vitu chu pu johwi tar'), 227 | ('X_PIG_LATIN', ' away ackupbay editcray ardcay ybay isitingvay ouryay illingbay eferencespray agepay orway isitvay ethay adwordsway elphay entrecay orfay oremay etailsday adwordsway ooglegay omcay upportsay'), 228 | ('ZHUANG', ' dih yinzminz ndaej daengz bujbienq youjyau dih cingzyin caeuq cinhingz diuz daihit boux boux ma daengz lajmbwn couh miz cwyouz cinhyenz caeuq genzli bouxboux bingzdaengj gyoengq vunz miz lijsing caeuq liengzsim wngdang daih gyoengq de lumj beixnuengx'), 229 | 230 | # This is just the "version marker": 231 | ('ICELANDIC', 'qpdbmrmxyzptlkuuddlrlrbas las les qpdbmrmxyzptlkuuddlrlrbas el la qpdbmrmxyzptlkuuddlrlrbas'), 232 | ) 233 | 234 | class TestCLD(unittest.TestCase): 235 | 236 | langsSeen = set() 237 | fullLangsSeen = set() 238 | 239 | def runOne(self, expectedLangName, s, doFull = False): 240 | if VERBOSE: 241 | print('') 242 | print('Test: %s [%d bytes]' % (expectedLangName, len(s))) 243 | failed = False 244 | for 
isPlainText in False, True: 245 | if doFull: 246 | detector = cld2full.detect 247 | else: 248 | detector = cld2.detect 249 | isReliable, textBytesFound, details = detector(s, isPlainText=isPlainText) 250 | if len(details) > 0: 251 | detectedLangName, detectedLangeCode = details[0][:2] 252 | 253 | if VERBOSE: 254 | print(' detected: %s' % detectedLangName) 255 | print(' reliable: %s' % (isReliable != 0)) 256 | print(' textBytes: %s' % textBytesFound) 257 | print(' details: %s' % str(details)) 258 | 259 | try: 260 | self.assertEqual(expectedLangName, detectedLangName, '%s != %s; details: %s' % (detectedLangName, expectedLangName, str(details))) 261 | except: 262 | traceback.print_exc() 263 | failed = True 264 | break 265 | if doFull: 266 | self.fullLangsSeen.add(detectedLangName) 267 | else: 268 | self.langsSeen.add(detectedLangName) 269 | else: 270 | try: 271 | self.fail('no language detected; expected %s' % expectedLangName) 272 | except: 273 | traceback.print_exc() 274 | failed = True 275 | break 276 | 277 | if failed: 278 | self.fail('some languages were wrong') 279 | 280 | def test_basic(self): 281 | for lang, text in testData: 282 | self.runOne(lang, text) 283 | for lang, text in fullTestData: 284 | self.runOne(lang, text, True) 285 | 286 | # End of per-language tests; start tests for specific functions: 287 | def test_vectors(self): 288 | for detector in cld2, cld2full: 289 | for lang, text in testData: 290 | isReliable, textBytesFound, details, vectors = detector.detect(text, returnVectors=True) 291 | self.assertTrue(textBytesFound > 0) 292 | if text == fr_en_Latn: 293 | if detector == cld2: 294 | self.assertEqual(4, len(vectors)) 295 | self.assertEqual(('en', 'fr', 'un', 'en'), tuple(x[3] for x in vectors)) 296 | #print('small: %s' % str(vectors)) 297 | else: 298 | self.assertEqual(3, len(vectors)) 299 | self.assertEqual(('en', 'fr', 'en'), tuple(x[3] for x in vectors)) 300 | #print('large: %s' % str(vectors)) 301 | 302 | def test_encoding_hint(self): 303 | 
for detector in cld2, cld2full: 304 | for lang, text in testData: 305 | for encoding in cld2.ENCODINGS: 306 | detector.detect(text, hintEncoding=encoding) 307 | 308 | def test_language_hint(self): 309 | for detector in cld2, cld2full: 310 | for lang, text in testData: 311 | for langHint in cld2.LANGUAGES: 312 | detector.detect(text, hintLanguage=langHint[0]) 313 | detector.detect(text, hintLanguage=langHint[1]) 314 | 315 | def test_top_level_domain_hint(self): 316 | for detector in cld2, cld2full: 317 | for lang, text in testData: 318 | detector.detect(text, hintTopLevelDomain='edu') 319 | detector.detect(text, hintTopLevelDomain='com') 320 | detector.detect(text, hintTopLevelDomain='id') 321 | 322 | def test_language_http_headers_hint(self): 323 | for detector in cld2, cld2full: 324 | for lang, text in testData: 325 | detector.detect(text, hintLanguageHTTPHeaders='mi,en') 326 | 327 | def test_debug_flags(self): 328 | for detector in cld2, cld2full: 329 | detector.detect(fr_en_Latn, debugScoreAsQuads=True) 330 | detector.detect(fr_en_Latn, debugHTML=True) 331 | detector.detect(fr_en_Latn, debugHTML=True, debugCR=True) 332 | detector.detect(fr_en_Latn, debugHTML=True, debugQuiet=True) 333 | detector.detect(fr_en_Latn, debugHTML=True, debugVerbose=True) 334 | detector.detect(fr_en_Latn, debugHTML=True, debugEcho=True) 335 | 336 | def test_unreliable(self): 337 | for detector in cld2, cld2full: 338 | isReliable, textBytesFound, details, vectors = detector.detect('interaktive infografik \xc3\xbcber videospielkonsolen', returnVectors = True) 339 | self.assertEqual(3, len(details)) 340 | 341 | if __name__ == '__main__': 342 | try: 343 | unittest.main() 344 | finally: 345 | 346 | # Confirm that cld2.DETECTED_LANGUAGES == all languages detected by 347 | # the test cases: 348 | for lang in cld2.DETECTED_LANGUAGES: 349 | # Raises KeyError if lang was never detected by the test: 350 | TestCLD.langsSeen.remove(lang) 351 | # Confirm that no languages detected by the test were 
not listed in cld2.DETECTED_LANGUAGES: 352 | if len(TestCLD.langsSeen) != 0: 353 | raise RuntimeError('unexpected additional languages were detected: %s' % TestCLD.langsSeen) 354 | 355 | if False: 356 | l = list(TestCLD.fullLangsSeen) 357 | l.sort() 358 | for x in l: 359 | print('PyTuple_SET_ITEM(detLangs, upto++, PyUnicode_FromString("%s"));' % x) 360 | 361 | # Confirm that cld2full.DETECTED_LANGUAGES == all languages detected by 362 | # the test cases: 363 | 364 | #print('FULL %d: %s' % (len(TestCLD.fullLangsSeen), ', '.join(TestCLD.fullLangsSeen))) 365 | for lang in cld2full.DETECTED_LANGUAGES: 366 | # Raises KeyError if lang was never detected by the test: 367 | TestCLD.fullLangsSeen.remove(lang) 368 | # Confirm that no languages detected by the test were not listed in cld2full.DETECTED_LANGUAGES: 369 | if len(TestCLD.fullLangsSeen) != 0: 370 | raise RuntimeError('unexpected additional languages were detected: %s' % TestCLD.fullLangsSeen) 371 | --------------------------------------------------------------------------------