├── .gitignore ├── HangulTests.py ├── Hangulpy.py └── README.textile /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.pyc -------------------------------------------------------------------------------- /HangulTests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | HangulTests.py 5 | 6 | Created by Ryan Rho on 2012-01-07. 7 | Copyright (c) 2012 __MyCompanyName__. All rights reserved. 8 | """ 9 | 10 | import unittest 11 | from Hangulpy import * 12 | 13 | class HangulTests(unittest.TestCase): 14 | def setUp(self): 15 | pass 16 | 17 | ################################################################################ 18 | # Boolean Hangul functions 19 | ################################################################################ 20 | 21 | def test_is_hangul(self): 22 | hangul_letters = u'가나다라힣뷁' 23 | other_letters = u'@%漢字かんじカンジhán tựლ╹◡╹ლ' 24 | non_unicode_letters = 'abcdez$%^&* ' 25 | 26 | for letter in hangul_letters: 27 | self.assertTrue(is_hangul(letter)) 28 | 29 | for letter in other_letters: 30 | self.assertFalse(is_hangul(letter)) 31 | 32 | for letter in non_unicode_letters: 33 | self.assertFalse(is_hangul(letter)) 34 | 35 | def test_has_jongsung(self): 36 | jongsung_letters = u'강줽뷁' 37 | non_jongsung_letters = u'가너댜봬쉐' 38 | 39 | for letter in jongsung_letters: 40 | self.assertTrue(has_jongsung(letter)) 41 | 42 | for letter in non_jongsung_letters: 43 | self.assertFalse(has_jongsung(letter)) 44 | 45 | ################################################################################ 46 | # Composition & Decomposition 47 | ################################################################################ 48 | 49 | def test_compose(self): 50 | test_list = [ 51 | (u'간', (u'ㄱ', u'ㅏ', u'ㄴ')), 52 | (u'가', (u'ㄱ', u'ㅏ', u'')), 53 | (u'가', (u'ㄱ', u'ㅏ', None)), 54 | (u'뷁', (u'ㅂ', u'ㅞ', u'ㄺ')) 55 | ] 56 | 57 | for answer, combination in test_list: 58 | self.assertEqual(answer, compose(*combination)) 59 | 60 | def test_decompose(self): 61 | test_list = [ 62 | (u'간', (u'ㄱ', u'ㅏ', u'ㄴ')), 63 | (u'가', (u'ㄱ', u'ㅏ', u'')), 64 | (u'뷁', (u'ㅂ', u'ㅞ', u'ㄺ')) 65 | ] 66 | 67 | for letter, answer in test_list: 68 | self.assertEqual(answer, decompose(letter)) 69 | 70 | if __name__ == '__main__': 71 | unittest.main() -------------------------------------------------------------------------------- /Hangulpy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | Hangulpy.py 5 | 6 | Copyright (C) 2012 Ryan Rho, Hyunwoo Cho 7 | 8 | Permission is hereby granted, free of charge, to any person obtaining a copy of 9 | this software and associated documentation files (the "Software"), to deal in 10 | the Software without restriction, including without limitation the rights to 11 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 12 | of the Software, and to permit persons to whom the Software is furnished to do 13 | so, subject to the following conditions: 14 | 15 | The above copyright notice and this permission notice shall be included in all 16 | copies or substantial portions of the Software. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | SOFTWARE. 25 | """ 26 | 27 | import string 28 | 29 | ################################################################################ 30 | # Hangul Unicode Variables 31 | ################################################################################ 32 | 33 | # Code = 0xAC00 + (Chosung_index * NUM_JOONGSUNGS * NUM_JONGSUNGS) + (Joongsung_index * NUM_JONGSUNGS) + (Jongsung_index) 34 | CHOSUNGS = [u'ㄱ',u'ㄲ',u'ㄴ',u'ㄷ',u'ㄸ',u'ㄹ',u'ㅁ',u'ㅂ',u'ㅃ',u'ㅅ',u'ㅆ',u'ㅇ',u'ㅈ',u'ㅉ',u'ㅊ',u'ㅋ',u'ㅌ',u'ㅍ',u'ㅎ'] 35 | JOONGSUNGS = [u'ㅏ',u'ㅐ',u'ㅑ',u'ㅒ',u'ㅓ',u'ㅔ',u'ㅕ',u'ㅖ',u'ㅗ',u'ㅘ',u'ㅙ',u'ㅚ',u'ㅛ',u'ㅜ',u'ㅝ',u'ㅞ',u'ㅟ',u'ㅠ',u'ㅡ',u'ㅢ',u'ㅣ'] 36 | JONGSUNGS = [u'',u'ㄱ',u'ㄲ',u'ㄳ',u'ㄴ',u'ㄵ',u'ㄶ',u'ㄷ',u'ㄹ',u'ㄺ',u'ㄻ',u'ㄼ',u'ㄽ',u'ㄾ',u'ㄿ',u'ㅀ',u'ㅁ',u'ㅂ',u'ㅄ',u'ㅅ',u'ㅆ',u'ㅇ',u'ㅈ',u'ㅊ',u'ㅋ',u'ㅌ',u'ㅍ',u'ㅎ'] 37 | 38 | NUM_CHOSUNGS = 19 39 | NUM_JOONGSUNGS = 21 40 | NUM_JONGSUNGS = 28 41 | 42 | FIRST_HANGUL_UNICODE = 0xAC00 #'가' 43 | LAST_HANGUL_UNICODE = 0xD7A3 #'힣' 44 | 45 | ################################################################################ 46 | # Boolean Hangul functions 47 | ################################################################################ 48 | 49 | def is_hangul(phrase): 50 | """Check whether the phrase is Hangul. 51 | This method ignores white spaces, punctuations, and numbers. 52 | @param phrase a target string 53 | @return True if the phrase is Hangul. False otherwise.""" 54 | 55 | # If the input is only one character, test whether the character is Hangul. 56 | if len(phrase) == 1: return is_all_hangul(phrase) 57 | 58 | # Remove all white spaces, punctuations, numbers. 59 | exclude = set(string.whitespace + string.punctuation + '0123456789') 60 | phrase = ''.join(ch for ch in phrase if ch not in exclude) 61 | 62 | return is_all_hangul(phrase) 63 | 64 | def is_all_hangul(phrase): 65 | """Check whether the phrase contains all Hangul letters 66 | @param phrase a target string 67 | @return True if the phrase only consists of Hangul. False otherwise.""" 68 | 69 | for unicode_value in map(lambda letter:ord(letter), phrase): 70 | if unicode_value < FIRST_HANGUL_UNICODE or unicode_value > LAST_HANGUL_UNICODE: 71 | # Check whether the letter is chosungs, joongsungs, or jongsungs. 72 | if unicode_value not in map(lambda v: ord(v), CHOSUNGS + JOONGSUNGS + JONGSUNGS[1:]): 73 | return False 74 | return True 75 | 76 | def has_jongsung(letter): 77 | """Check whether this letter contains Jongsung""" 78 | if len(letter) != 1: 79 | raise Exception('The target string must be one letter.') 80 | if not is_hangul(letter): 81 | raise NotHangulException('The target string must be Hangul') 82 | 83 | unicode_value = ord(letter) 84 | return (unicode_value - FIRST_HANGUL_UNICODE) % NUM_JONGSUNGS > 0 85 | 86 | def has_batchim(letter): 87 | """This method is the same as has_jongsung()""" 88 | return has_jongsung(letter) 89 | 90 | def has_approximant(letter): 91 | """Approximant makes complex vowels, such as ones starting with y or w. 92 | In Korean there is a unique approximant euㅡ making uiㅢ, but ㅢ does not make many irregularities.""" 93 | if len(letter) != 1: 94 | raise Exception('The target string must be one letter.') 95 | if not is_hangul(letter): 96 | raise NotHangulException('The target string must be Hangul') 97 | 98 | jaso = decompose(letter) 99 | diphthong = (2, 3, 6, 7, 9, 10, 12, 14, 15, 17) 100 | # [u'ㅑ',u'ㅒ',',u'ㅕ',u'ㅖ',u'ㅘ',u'ㅙ',u'ㅛ',u'ㅝ',u'ㅞ',u'ㅠ'] 101 | # excluded 'ㅢ' because y- and w-based complex vowels are irregular. 102 | # vowels with umlauts (ㅐ, ㅔ, ㅚ, ㅟ) are not considered complex vowels. 103 | return jaso[1] in diphthong 104 | 105 | ################################################################################ 106 | # Decomposition & Combination 107 | ################################################################################ 108 | 109 | def compose(chosung, joongsung, jongsung=u''): 110 | """This function returns a Hangul letter by composing the specified chosung, joongsung, and jongsung. 111 | @param chosung 112 | @param joongsung 113 | @param jongsung the terminal Hangul letter. This is optional if you do not need a jongsung.""" 114 | 115 | if jongsung is None: jongsung = u'' 116 | 117 | try: 118 | chosung_index = CHOSUNGS.index(chosung) 119 | joongsung_index = JOONGSUNGS.index(joongsung) 120 | jongsung_index = JONGSUNGS.index(jongsung) 121 | except Exception, e: 122 | raise NotHangulException('No valid Hangul character can be generated using given combination of chosung, joongsung, and jongsung.') 123 | 124 | return unichr(0xAC00 + chosung_index * NUM_JOONGSUNGS * NUM_JONGSUNGS + joongsung_index * NUM_JONGSUNGS + jongsung_index) 125 | 126 | def decompose(hangul_letter): 127 | """This function returns letters by decomposing the specified Hangul letter.""" 128 | 129 | if len(hangul_letter) < 1: 130 | raise NotLetterException('') 131 | elif not is_hangul(hangul_letter): 132 | raise NotHangulException('') 133 | 134 | code = ord(hangul_letter) - FIRST_HANGUL_UNICODE 135 | jongsung_index = code % NUM_JONGSUNGS 136 | code /= NUM_JONGSUNGS 137 | joongsung_index = code % NUM_JOONGSUNGS 138 | code /= NUM_JOONGSUNGS 139 | chosung_index = code 140 | 141 | return (CHOSUNGS[chosung_index], JOONGSUNGS[joongsung_index], JONGSUNGS[jongsung_index]) 142 | 143 | ################################################################################ 144 | # Josa functions 145 | ################################################################################ 146 | 147 | def josa_en(word): 148 | """add josa either '은' or '는' at the end of this word""" 149 | word = word.strip() 150 | if not is_hangul(word): raise NotHangulException('') 151 | 152 | last_letter = word[-1] 153 | josa = u'은' if has_jongsung(last_letter) else u'는' 154 | return word + josa 155 | 156 | def josa_eg(word): 157 | """add josa either '이' or '가' at the end of this word""" 158 | word = word.strip() 159 | if not is_hangul(word): raise NotHangulException('') 160 | 161 | last_letter = word[-1] 162 | josa = u'이' if has_jongsung(last_letter) else u'가' 163 | return word + josa 164 | 165 | def josa_el(word): 166 | """add josa either '을' or '를' at the end of this word""" 167 | word = word.strip() 168 | if not is_hangul(word): raise NotHangulException('') 169 | 170 | last_letter = word[-1] 171 | josa = u'을' if has_jongsung(last_letter) else u'를' 172 | return word + josa 173 | 174 | def josa_ro(word): 175 | """add josa either '으로' or '로' at the end of this word""" 176 | word = word.strip() 177 | if not is_hangul(word): raise NotHangulException('') 178 | 179 | last_letter = word[-1] 180 | if not has_jongsung(last_letter): 181 | josa = u'로' 182 | elif (ord(last_letter) - FIRST_HANGUL_UNICODE) % NUM_JONGSUNGS == 9: # ㄹ 183 | josa = u'로' 184 | else: 185 | josa = u'으로' 186 | 187 | return word + josa 188 | 189 | def josa_gwa(word): 190 | """add josa either '과' or '와' at the end of this word""" 191 | word = word.strip() 192 | if not is_hangul(word): raise NotHangulException('') 193 | 194 | last_letter = word[-1] 195 | josa = u'과' if has_jongsung(last_letter) else u'와' 196 | return word + josa 197 | 198 | def josa_ida(word): 199 | """add josa either '이다' or '다' at the end of this word""" 200 | word = word.strip() 201 | if not is_hangul(word): raise NotHangulException('') 202 | 203 | last_letter = word[-1] 204 | josa = u'이다' if has_jongsung(last_letter) else u'다' 205 | return word + josa 206 | 207 | ################################################################################ 208 | # Prefixes and suffixes 209 | # Practice area; need more organization 210 | ################################################################################ 211 | 212 | def add_ryul(word): 213 | """add suffix either '률' or '율' at the end of this word""" 214 | word = word.strip() 215 | if not is_hangul(word): raise NotHangulException('') 216 | 217 | last_letter = word[-1] 218 | if not has_jongsung(last_letter): 219 | ryul = u'율' 220 | elif (ord(last_letter) - FIRST_HANGUL_UNICODE) % NUM_JONGSUNGS == 4: # ㄴ 221 | ryul = u'율' 222 | else: 223 | ryul = u'률' 224 | 225 | return word + ryul 226 | 227 | ################################################################################ 228 | # The formatter, or ultimately, a template system 229 | # Practice area; need more organization 230 | ################################################################################ 231 | 232 | def ili(word): 233 | """convert {가} or {이} to their correct respective particles automagically.""" 234 | word = word.strip() 235 | if not is_hangul(word): raise NotHangulException('') 236 | 237 | last_letter = word[word.find(u'{가}')-1] 238 | word = word.replace(u'{가}', (u'이' if has_jongsung(last_letter) else u'가')) 239 | 240 | last_letter = word[word.find(u'{이}')-1] 241 | word = word.replace(u'{이}', (u'이' if has_jongsung(last_letter) else u'가')) 242 | return word 243 | 244 | ################################################################################ 245 | # Exceptions 246 | ################################################################################ 247 | 248 | class NotHangulException(Exception): 249 | pass 250 | 251 | class NotLetterException(Exception): 252 | pass 253 | 254 | class NotWordException(Exception): 255 | pass 256 | -------------------------------------------------------------------------------- /README.textile: -------------------------------------------------------------------------------- 1 | h1. Hangulpy 2 | 3 | h2. Overview 4 | 5 | Hangulpy is a python module that handles Hangul Unicode features. 6 | 7 | h2. License 8 | 9 | MIT License 10 | 11 | h1. 한글파이 12 | 13 | h2. 개요 14 | 15 | 한글파이는 한글 유니코드 기능을 제공하는 파이썬 모듈입니다. 16 | 17 | h3. 자동 조사/접미사 첨부 18 | 19 | 정해진 규격을 지키면 주격, 도구격, 서술격 등 모든 조사를 앞 단어에 알맞게 자동으로 바꿔 줍니다. 20 | 21 | h3. 자모 분해 및 결합 22 | 23 | 겹받침, 반자음 + 모음, 'ㅣ' 역행동화가 일어난 모음 등을 자동화에 필요한 상황에 맞게 쓸 수 있도록 분해하고 결합합니다. 24 | 25 | h3. 두음법칙을 적용한 한자 음 읽기 (미구현) 26 | 27 | 단어 속 위치에 따라 다르게 소리나는 한자를 자동으로 잡아 줍니다. (미구현) 28 | 29 | h2. 라이센스 30 | 31 | MIT License --------------------------------------------------------------------------------