├── README.md └── lang_code.py /README.md: -------------------------------------------------------------------------------- 1 | lang-code 2 | ========= 3 | This script converts ISO 639 language codes. This version covers most of the major languages of the world. Using the ISO 639-1 list as its basis, it converts language codes between ISO 639-1 and ISO 639-3; note that not every ISO 639-3 code is listed here. 4 | 5 | - Part 1 (ISO 639-1:2002) provides a 2-letter code that has been designed to represent most of the major languages of the world. 6 | - Part 3 (ISO 639-3:2007) provides a 3-letter code and aims to give as complete a listing of languages as possible, including living, extinct and ancient languages. 7 | 8 | More information about ISO 639: 9 | - http://www.iso.org/iso/home/standards/language_codes.htm 10 | - http://www-01.sil.org/iso639-3/codes.asp 11 | 12 | #Script: 13 | 14 | Class: 15 | - Lang_Code: initialize the class by passing the original and target language code sets ("ISO-639-1" or "ISO-639-3"). 16 | 17 | Methods: 18 | - changeCode(string): convert a language code from the original code set to the target code set; returns "und" for a code that is not in the table. 19 | - get_noDetected(): return the number of codes that could not be converted, together with those codes joined by "_". 20 | 21 | #Example: 22 | 23 | import lang_code 24 | f = lang_code.Lang_Code("ISO-639-1","ISO-639-3") 25 | code1 = "fr" 26 | code2 = f.changeCode(code1) 27 | print code2 28 | ==>"fra" 29 | -------------------------------------------------------------------------------- /lang_code.py: -------------------------------------------------------------------------------- 1 | #! 
#!/usr/bin/env python
# -*- coding:utf-8 -*-


class InputCodeError(ValueError, TypeError):
    """Raised when a code-set name is neither ISO 639-1 nor ISO 639-3.

    The previous implementation executed ``raise "InputCodeError"``; raising
    a plain string is itself an error and surfaced as a ``TypeError``.  This
    class therefore also derives from ``TypeError`` so that callers which
    caught that accidental ``TypeError`` keep working, while new code can
    catch ``InputCodeError`` or ``ValueError``.
    """


class Lang_Code(object):
    """Convert language codes between ISO 639-1 and ISO 639-3.

    Attributes:
        origine: normalized name of the source code set, either
            ``"ISO-639-1"`` or ``"ISO-639-3"``.
        target: normalized name of the destination code set (same values).
        miss: number of ``changeCode`` calls whose input was not found.
        misslang: the inputs that were not found, in call order.
        langs: per-instance list of ``(ISO 639-1, ISO 639-3)`` pairs.
        dict: lookup table from ``origine`` codes to ``target`` codes.
    """

    # Column of each code set inside a ``langs`` pair.
    _COLUMN = {"ISO-639-1": 0, "ISO-639-3": 1}

    # (ISO 639-1, ISO 639-3) pairs.  Order matters: when building the
    # 3 -> 1 table, "und" appears twice ("bh" and "und") and the later
    # entry wins, so "und" maps to "und".
    _LANGS = (
        ("aa", "aar"), ("ab", "abk"), ("ae", "ave"), ("af", "afr"),
        ("ak", "aka"), ("am", "amh"), ("an", "arg"), ("ar", "ara"),
        ("as", "asm"), ("av", "ava"), ("ay", "aym"), ("az", "aze"),
        ("ba", "bak"), ("be", "bel"), ("bg", "bul"), ("bh", "und"),
        ("bi", "bis"), ("bm", "bam"), ("bn", "ben"), ("bo", "bod"),
        ("br", "bre"), ("bs", "bos"), ("ca", "cat"), ("ce", "che"),
        ("ch", "cha"), ("co", "cos"), ("cr", "cre"), ("cs", "ces"),
        ("cu", "chu"), ("cv", "chv"), ("cy", "cym"), ("da", "dan"),
        ("de", "deu"), ("dv", "div"), ("dz", "dzo"), ("ee", "ewe"),
        ("el", "ell"), ("en", "eng"), ("eo", "epo"), ("es", "spa"),
        ("et", "est"), ("eu", "eus"), ("fa", "fas"), ("ff", "ful"),
        ("fi", "fin"), ("fj", "fij"), ("fo", "fao"), ("fr", "fra"),
        ("fy", "fry"), ("ga", "gle"), ("gd", "gla"), ("gl", "glg"),
        ("gn", "grn"), ("gu", "guj"), ("gv", "glv"), ("ha", "hau"),
        ("he", "heb"), ("hi", "hin"), ("ho", "hmo"), ("hr", "hrv"),
        ("ht", "hat"), ("hu", "hun"), ("hy", "hye"), ("hz", "her"),
        ("ia", "ina"), ("id", "ind"), ("ie", "ile"), ("ig", "ibo"),
        ("ii", "iii"), ("ik", "ipk"), ("io", "ido"), ("is", "isl"),
        ("it", "ita"), ("iu", "iku"), ("ja", "jpn"), ("jv", "jav"),
        ("ka", "kat"), ("kg", "kon"), ("ki", "kik"), ("kj", "kua"),
        ("kk", "kaz"), ("kl", "kal"), ("km", "khm"), ("kn", "kan"),
        ("ko", "kor"), ("kr", "kau"), ("ks", "kas"), ("ku", "kur"),
        ("kv", "kom"), ("kw", "cor"), ("ky", "kir"), ("la", "lat"),
        ("lb", "ltz"), ("lg", "lug"), ("li", "lim"), ("ln", "lin"),
        ("lo", "lao"), ("lt", "lit"), ("lu", "lub"), ("lv", "lav"),
        ("mg", "mlg"), ("mh", "mah"), ("mi", "mri"), ("mk", "mkd"),
        ("ml", "mal"), ("mn", "mon"), ("mr", "mar"), ("ms", "msa"),
        ("mt", "mlt"), ("my", "mya"), ("na", "nau"), ("nb", "nob"),
        ("nd", "nde"), ("ne", "nep"), ("ng", "ndo"), ("nl", "nld"),
        ("nn", "nno"), ("no", "nor"), ("nr", "nbl"), ("nv", "nav"),
        ("ny", "nya"), ("oc", "oci"), ("oj", "oji"), ("om", "orm"),
        ("or", "ori"), ("os", "oss"), ("pa", "pan"), ("pi", "pli"),
        ("pl", "pol"), ("ps", "pus"), ("pt", "por"), ("qu", "que"),
        ("rm", "roh"), ("rn", "run"), ("ro", "ron"), ("ru", "rus"),
        ("rw", "kin"), ("sa", "san"), ("sc", "srd"), ("sd", "snd"),
        ("se", "sme"), ("sg", "sag"), ("sh", "hbs"), ("si", "sin"),
        ("sk", "slk"), ("sl", "slv"), ("sm", "smo"), ("sn", "sna"),
        ("so", "som"), ("sq", "sqi"), ("sr", "srp"), ("ss", "ssw"),
        ("st", "sot"), ("su", "sun"), ("sv", "swe"), ("sw", "swa"),
        ("ta", "tam"), ("te", "tel"), ("tg", "tgk"), ("th", "tha"),
        ("ti", "tir"), ("tk", "tuk"), ("tl", "tgl"), ("tn", "tsn"),
        ("to", "ton"), ("tr", "tur"), ("ts", "tso"), ("tt", "tat"),
        ("tw", "twi"), ("ty", "tah"), ("ug", "uig"), ("uk", "ukr"),
        ("ur", "urd"), ("uz", "uzb"), ("ve", "ven"), ("vi", "vie"),
        ("vo", "vol"), ("wa", "wln"), ("wo", "wol"), ("xh", "xho"),
        ("yi", "yid"), ("yo", "yor"), ("za", "zha"), ("zh", "zho"),
        ("zu", "zul"), ("und", "und"),
    )

    def __init__(self, origine, target):
        """Create a converter from code set *origine* to code set *target*.

        Args:
            origine: source code set; "ISO-639-1"/"ISO639-1" or
                "ISO-639-3"/"ISO639-3".
            target: destination code set; same accepted spellings.

        Raises:
            InputCodeError: if either name is not an accepted spelling.
        """
        self.origine = self.__control_code(origine)
        self.target = self.__control_code(target)
        self.miss = 0  # number of codes that could not be converted
        self.misslang = []
        # Each instance keeps its own list, as before, so that mutating
        # ``langs`` on one converter never affects another.
        self.langs = list(self._LANGS)
        self.dict = {}
        self.__make_dict()

    def __control_code(self, coding):
        """Return the canonical name for the code set *coding*.

        Raises:
            InputCodeError: if *coding* is not one of the accepted spellings.
        """
        # Tuple membership compares with ``==`` and never hashes, so an
        # unhashable argument is rejected cleanly instead of crashing here.
        if coding in ("ISO-639-1", "ISO639-1"):
            return "ISO-639-1"
        if coding in ("ISO-639-3", "ISO639-3"):
            return "ISO-639-3"
        raise InputCodeError(
            "InputCodeError: unsupported code set %r; "
            "expected 'ISO-639-1' or 'ISO-639-3'" % (coding,)
        )

    def __make_dict(self):
        """Rebuild ``self.dict`` so it maps ``origine`` codes to ``target``.

        Both ends are honoured: when ``origine`` and ``target`` name the same
        code set the table maps every known code to itself (the earlier
        implementation ignored ``target`` and always crossed to the other
        code set).
        """
        src = self._COLUMN[self.origine]
        dst = self._COLUMN[self.target]
        # Later pairs overwrite earlier ones on duplicate keys, matching the
        # insertion order of the original loop.
        self.dict = {pair[src]: pair[dst] for pair in self.langs}

    def set_targetCode(self, target):
        """Change the destination code set and rebuild the lookup table.

        Raises:
            InputCodeError: if *target* is not an accepted spelling.
        """
        self.target = self.__control_code(target)
        self.__make_dict()

    def set_origineCode(self, origine):
        """Change the source code set and rebuild the lookup table.

        Raises:
            InputCodeError: if *origine* is not an accepted spelling.
        """
        self.origine = self.__control_code(origine)
        self.__make_dict()

    def changeCode(self, string):
        """Convert one language code from ``origine`` to ``target``.

        Returns:
            The converted code, or ``"und"`` when *string* is not in the
            table.  Every miss increments ``self.miss`` and is appended to
            ``self.misslang``.
        """
        try:
            return self.dict[string]
        except (KeyError, TypeError):
            # KeyError: unknown code.  TypeError: unhashable input, which the
            # old bare ``except`` also treated as a miss.  Anything else
            # (e.g. KeyboardInterrupt) is no longer swallowed.
            self.miss += 1
            self.misslang.append(string)
            return "und"

    def get_noDetected(self):
        """Return ``(miss_count, missed_codes)``.

        ``missed_codes`` is the recorded unknown inputs joined with ``"_"``.
        """
        return self.miss, "_".join(self.misslang)