├── .gitignore ├── LICENSE ├── README.mkd ├── c ├── Makefile ├── README.mkd ├── c.make ├── cp2py.c ├── hanzi-bindata.c ├── hanzi-sqlite.c ├── hanzi.c ├── hanzi.h ├── test │ ├── Makefile │ ├── test.c │ ├── utf8-test-mixed.txt │ └── utf8-test.txt ├── unicode.c ├── unicode.h └── util.h ├── data ├── README.md └── Unihan │ ├── block1-codepoint2pinyin.txt │ ├── block1-pinyin-statistic.rb │ ├── create-bindata.rb │ ├── create-sqlitedb.rb │ ├── extract-block1.rb │ ├── generate-data.sh │ ├── process-unihan-readings.rb │ ├── test-db.rb │ ├── test-process.rb │ └── unihan-codepoint2pinyin.txt └── objective-c ├── Hanzi2Pinyin ├── Hanzi2Pinyin.xcodeproj │ ├── project.pbxproj │ └── project.xcworkspace │ │ └── contents.xcworkspacedata ├── Hanzi2Pinyin │ ├── Hanzi2Pinyin-Info.plist │ ├── Hanzi2Pinyin-Prefix.pch │ ├── Hanzi2Pinyin.h │ ├── Hanzi2Pinyin.m │ ├── en.lproj │ │ └── InfoPlist.strings │ └── pinyin.dat └── Hanzi2PinyinTests │ ├── Hanzi2PinyinTests-Info.plist │ ├── Hanzi2PinyinTests.h │ ├── Hanzi2PinyinTests.m │ └── en.lproj │ └── InfoPlist.strings └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.dat 3 | *.db 4 | xcuserdata 5 | .DS_store 6 | *DerivedData 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Chen Yufei 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be 
included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.mkd: -------------------------------------------------------------------------------- 1 | This library converts Chinese characters to pinyin. 2 | 3 | Limitations: 4 | 5 | 1. Does not handle polyphones (yet). 6 | 2. The pinyin data is obtained from the Unihan database. It may be incomplete or 7 | incorrect for some characters, but I can't find more accurate data from other 8 | resources. 9 | -------------------------------------------------------------------------------- /c/Makefile: -------------------------------------------------------------------------------- 1 | CC = llvm-gcc 2 | LDFLAGS = -liconv 3 | CFLAGS = -g -std=gnu99 4 | 5 | all: cp2py 6 | 7 | cp2py: cp2py.o hanzi.o hanzi-bindata.o unicode.o 8 | $(call cc-link) 9 | 10 | include c.make 11 | 12 | clean: 13 | -rm -f *.o 14 | -rm -f $(TEST_RUN) 15 | -------------------------------------------------------------------------------- /c/README.mkd: -------------------------------------------------------------------------------- 1 | There are 2 implementations to look up the pinyin for a given Unicode code point. 2 | 3 | 1. `hanzi-sqlite.c` looks up the pinyin in a sqlite3 database 4 | 2. `hanzi-bindata.c` mmaps the pinyin data array in a file, and uses the code 5 | point as an index into the array 6 | 7 | The second approach is much simpler and should be faster. 
8 | 9 | The data can be created by the scripts `data/Unihan/create-{bindata,sqlitedb}.rb`. 10 | **You need to modify the data path in the C implementation files**. 11 | 12 | `unicode.c` contains code to convert a UTF-8 string to a UTF-32 string, using 13 | libiconv. (I wrote a naive UTF converter previously, but it's better to 14 | examine and use the ["no longer official" example code](http://gears.googlecode.com/svn/trunk/third_party/convert_utf/ConvertUTF.c).) 15 | -------------------------------------------------------------------------------- /c/c.make: -------------------------------------------------------------------------------- 1 | # Include this file at the end of project specific Makefile 2 | 3 | # Use make V=1 to see verbose output 4 | quiet-command = $(if $(V),$1,$(if $2,@echo $2 && $1, @$1)) 5 | 6 | # Pass additional options as the first argument. 7 | # We can use also use target/pattern specific Variables to redefine LDFLAGS, 8 | # but that's not clear and flexible as this approach. 9 | cc-link = $(call quiet-command, $(CC) $^ $(LDFLAGS) -o $@ $1, " LINK $@") 10 | cxx-link = $(call quiet-command, $(CXX) $^ $(CXXLDFLAGS) -o $@ $1, " LINK $@") 11 | 12 | %.o: %.c 13 | $(call quiet-command, $(CC) $(CFLAGS) -c -o $@ $<, " CC $@") 14 | %.o: %.cpp 15 | $(call quiet-command, $(CXX) $(CXXFLAGS) -c -o $@ $<, " CXX $@") 16 | 17 | # gcc command line options meaning 18 | # -MM ignores system headers 19 | # -MT to specify the target 20 | # Generate rules like "foo.d foo.o: foo.c foo.h", so foo.d will get updated 21 | # each time foo.c/h is changed. 
/* Command-line front end: print the pinyin of the string given as the first
 * argument, using a single space as the separator between syllables.
 *
 * Exit status is 0 on success and 1 when no argument was supplied or the
 * conversion failed. */
int main(int argc, char* argv[])
{
    if (argc < 2) {
        fprintf(stderr, "Usage: %s <hanzi>\n", argv[0]);
        exit(1);
    }

    char *pinyin = hz2pinyin(argv[1], " ");
    /* hz2pinyin() returns NULL on error; handing NULL to printf("%s") is
     * undefined behaviour, so report the failure instead. */
    if (!pinyin) {
        fprintf(stderr, "%s: cannot convert \"%s\" to pinyin\n",
                argv[0], argv[1]);
        return 1;
    }

    printf("%s\n", pinyin);
    free(pinyin);

    return 0;
}
13 | */ 14 | 15 | static const char *data_path = 16 | "/Users/alex/programming/hanzi2pinyin/data/pinyin.dat"; 17 | 18 | static uint16_t *pinyin_data; 19 | 20 | /* Obtained from data/Unihan/block1-pinyin-statistic.rb */ 21 | static const char *pinyin_tbl[] = { 22 | "a", "ai", "an", "ang", "ao", "ba", "bai", "ban", "bang", "bao", "bei", 23 | "ben", "beng", "bi", "bian", "biao", "bie", "bin", "bing", "bo", "bu", "ca", 24 | "cai", "can", "cang", "cao", "ce", "cen", "ceng", "cha", "chai", "chan", 25 | "chang", "chao", "che", "chen", "cheng", "chi", "chong", "chou", "chu", 26 | "chua", "chuai", "chuan", "chuang", "chui", "chun", "chuo", "ci", "cong", 27 | "cou", "cu", "cuan", "cui", "cun", "cuo", "da", "dai", "dan", "dang", "dao", 28 | "de", "den", "deng", "di", "dia", "dian", "diao", "die", "ding", "diu", 29 | "dong", "dou", "du", "duan", "dui", "dun", "duo", "e", "ei", "en", "eng", 30 | "er", "fa", "fan", "fang", "fei", "fen", "feng", "fiao", "fo", "fou", "fu", 31 | "ga", "gai", "gan", "gang", "gao", "ge", "gei", "gen", "geng", "gong", 32 | "gou", "gu", "gua", "guai", "guan", "guang", "gui", "gun", "guo", "ha", 33 | "hai", "han", "hang", "hao", "he", "hei", "hen", "heng", "hm", "hong", 34 | "hou", "hu", "hua", "huai", "huan", "huang", "hui", "hun", "huo", "ji", 35 | "jia", "jian", "jiang", "jiao", "jie", "jin", "jing", "jiong", "jiu", "ju", 36 | "juan", "jue", "jun", "ka", "kai", "kan", "kang", "kao", "ke", "ken", 37 | "keng", "kong", "kou", "ku", "kua", "kuai", "kuan", "kuang", "kui", "kun", 38 | "kuo", "la", "lai", "lan", "lang", "lao", "le", "lei", "leng", "li", "lia", 39 | "lian", "liang", "liao", "lie", "lin", "ling", "liu", "long", "lou", "lu", 40 | "luan", "lun", "luo", "lv", "lve", "m", "ma", "mai", "man", "mang", "mao", 41 | "me", "mei", "men", "meng", "mi", "mian", "miao", "mie", "min", "ming", 42 | "miu", "mo", "mou", "mu", "n", "na", "nai", "nan", "nang", "nao", "ne", 43 | "nei", "nen", "neng", "ni", "nian", "niang", "niao", "nie", "nin", "ning", 44 | 
"niu", "nong", "nou", "nu", "nuan", "nun", "nuo", "nv", "nve", "o", "ou", 45 | "pa", "pai", "pan", "pang", "pao", "pei", "pen", "peng", "pi", "pian", 46 | "piao", "pie", "pin", "ping", "po", "pou", "pu", "qi", "qia", "qian", 47 | "qiang", "qiao", "qie", "qin", "qing", "qiong", "qiu", "qu", "quan", "que", 48 | "qun", "r", "ran", "rang", "rao", "re", "ren", "reng", "ri", "rong", "rou", 49 | "ru", "rua", "ruan", "rui", "run", "ruo", "sa", "sai", "san", "sang", "sao", 50 | "se", "sen", "seng", "sha", "shai", "shan", "shang", "shao", "she", "shen", 51 | "sheng", "shi", "shou", "shu", "shua", "shuai", "shuan", "shuang", "shui", 52 | "shun", "shuo", "si", "song", "sou", "su", "suan", "sui", "sun", "suo", 53 | "ta", "tai", "tan", "tang", "tao", "te", "teng", "ti", "tian", "tiao", 54 | "tie", "ting", "tong", "tou", "tu", "tuan", "tui", "tun", "tuo", "wa", 55 | "wai", "wan", "wang", "wei", "wen", "weng", "wo", "wu", "xi", "xia", "xian", 56 | "xiang", "xiao", "xie", "xin", "xing", "xiong", "xiu", "xu", "xuan", "xue", 57 | "xun", "ya", "yan", "yang", "yao", "ye", "yi", "yin", "ying", "yo", "yong", 58 | "you", "yu", "yuan", "yue", "yun", "za", "zai", "zan", "zang", "zao", "ze", 59 | "zei", "zen", "zeng", "zha", "zhai", "zhan", "zhang", "zhao", "zhe", "zhen", 60 | "zheng", "zhi", "zhong", "zhou", "zhu", "zhua", "zhuai", "zhuan", "zhuang", 61 | "zhui", "zhun", "zhuo", "zi", "zong", "zou", "zu", "zuan", "zui", "zun", 62 | "zuo" 63 | }; 64 | 65 | static int init_data() { 66 | CALL_ONCE(0); 67 | 68 | FILE *data_file; 69 | data_file = fopen(data_path, "rb"); 70 | if (!data_file) { 71 | fprintf(stderr, "Can't open pinyin data file\n"); 72 | return -1; 73 | } 74 | 75 | fseek(data_file, 0, SEEK_END); 76 | size_t length = ftell(data_file); 77 | fseek(data_file, 0, SEEK_SET); 78 | 79 | int fd = fileno(data_file); 80 | off_t offset = 0; 81 | pinyin_data = mmap(NULL, length, PROT_READ, MAP_SHARED, fd, offset); 82 | if (pinyin_data == MAP_FAILED) { 83 | fprintf(stderr, "Can't mmap pinyin 
data file\n"); 84 | fclose(data_file); 85 | pinyin_data = NULL; 86 | return -1; 87 | } 88 | 89 | return 0; 90 | } 91 | 92 | const char *hz2pinyin_codepoint(uint32_t cp) { 93 | init_data(); 94 | 95 | if (! pinyin_data) { 96 | return NULL; 97 | } 98 | 99 | if (hz_is_hanzi(cp)) { 100 | uint16_t id = pinyin_data[cp - HANZI_START_CODEPOINT]; 101 | if (id == 0xFFFF) 102 | return NULL; 103 | else 104 | return pinyin_tbl[id]; 105 | } else 106 | return NULL; 107 | } 108 | -------------------------------------------------------------------------------- /c/hanzi-sqlite.c: -------------------------------------------------------------------------------- 1 | #include "hanzi.h" 2 | #include "util.h" 3 | #include "sqlite3.h" 4 | #include 5 | #include 6 | 7 | static sqlite3 *db; 8 | static sqlite3_stmt *query; 9 | /* The sqlite3 database is created by data/Unihan/create-sqlitedb.rb */ 10 | static const char *db_path = 11 | "/Users/alex/programming/hanzi2pinyin/data/codepoint2pinyin.db"; 12 | static const char *query_string = 13 | "SELECT pinyin FROM codepoint2pinyin WHERE codepoint = ?"; 14 | 15 | static int pinyin_db_init(void) { 16 | CALL_ONCE(0); 17 | 18 | int err = sqlite3_open_v2(db_path, &db, SQLITE_OPEN_READONLY, NULL); 19 | if (err != SQLITE_OK) { 20 | fprintf(stderr, "error opening database: %d\n", err); 21 | goto error; 22 | } 23 | 24 | err = sqlite3_prepare_v2(db, query_string, -1, &query, NULL); 25 | if (err != SQLITE_OK) { 26 | fprintf(stderr, "error preparing statement: %d\n", err); 27 | goto error; 28 | } 29 | 30 | return 0; 31 | 32 | error: 33 | sqlite3_close(db); 34 | db = NULL; 35 | return -1; 36 | } 37 | 38 | static int check_sqlite_error(int err, const char *msg) { 39 | if (err == SQLITE_ERROR || err == SQLITE_MISUSE) { 40 | fprintf(stderr, "%s: %s\n", msg, sqlite3_errmsg(db)); 41 | return 1; 42 | } 43 | } 44 | 45 | const char *hz2pinyin_codepoint(uint32_t cp) { 46 | int err; 47 | 48 | pinyin_db_init(); 49 | if (!db) 50 | return NULL; 51 | 52 | /* Reset for use. 
53 | * XXX This will release memory returned by previous call to column access 54 | * function. */ 55 | sqlite3_reset(query); 56 | 57 | err = sqlite3_bind_int(query, 1, cp); 58 | if (check_sqlite_error(err, "bind_int")) 59 | return NULL; 60 | 61 | /* Does not handle other cases like SQLITE_BUSY now. May need to retry 62 | * on those errors. */ 63 | err = sqlite3_step(query); 64 | if (check_sqlite_error(err, "step")) 65 | return NULL; 66 | 67 | if (err != SQLITE_ROW) 68 | return NULL; 69 | 70 | const char *py = sqlite3_column_text(query, 0); 71 | 72 | return py; 73 | } 74 | 75 | -------------------------------------------------------------------------------- /c/hanzi.c: -------------------------------------------------------------------------------- 1 | #include "hanzi.h" 2 | // #define DEBUG 3 | #include "util.h" 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | static const int PINYIN_BUF_SIZE = 128; 10 | 11 | static inline bool need_separater(uint32_t prevcp, uint32_t curcp) { 12 | bool r = false; 13 | if (hz_is_hanzi(curcp)) { 14 | r = isspace(prevcp) ? false : true; 15 | } else if (isspace(curcp)) { 16 | r = false; 17 | } else { 18 | r = hz_is_hanzi(prevcp) ? true : false; 19 | } 20 | return r; 21 | } 22 | 23 | char *hz2pinyin(const char *hanzi, const char *sep) { 24 | /* This function is too large. */ 25 | size_t n; 26 | UTF32 *cp; 27 | 28 | int bufsize = PINYIN_BUF_SIZE; 29 | char *out = (char *)calloc(1, bufsize); 30 | char *out_ptr = out; 31 | int out_nleft = bufsize; /* How many bytes left in the out buffer. */ 32 | 33 | /* First convert the string to unicode codepoint */ 34 | cp = hz_utf8_to_utf32(hanzi, &n); 35 | if (!cp) { 36 | DPRINTF("conversion to codepoint failed\n"); 37 | return NULL; 38 | } 39 | 40 | int ret = 0; 41 | /* The first character needs no separator, so initiate prevcp as a space. 
*/ 42 | uint32_t prevcp = ' '; 43 | 44 | /* Convert codepoint to pinyin one by one */ 45 | for (int i = 0; i < n; i++) { 46 | redo: 47 | /* For ASCII, just include it in the the final string. */ 48 | if (cp[i] < 256) { 49 | /* Add sperator between Chinese and English */ 50 | if (need_separater(prevcp, cp[i])) 51 | ret = snprintf(out_ptr, out_nleft, "%s%c", sep, cp[i]); 52 | else 53 | ret = snprintf(out_ptr, out_nleft, "%c", cp[i]); 54 | } else { 55 | const char *py = hz2pinyin_codepoint(cp[i]); 56 | if (!py) { 57 | DPRINTF("codepoint %#x doesn't have pinyin", cp[i]); 58 | goto error; 59 | } 60 | 61 | DPRINTF("%x %s ", cp[i], py); 62 | 63 | /* When previous character is chinese, add seperator. */ 64 | if (need_separater(prevcp, cp[i])) { 65 | ret = snprintf(out_ptr, out_nleft, "%s%s", sep, py); 66 | } else { 67 | ret = snprintf(out_ptr, out_nleft, "%s", py); 68 | } 69 | } 70 | /* Not enough space left in the buffer. Remeber snprinf need 1 byte to 71 | store \0, so if ret == out_nleft, out buffer has 1 less byte. 
*/ 72 | if (ret >= out_nleft) { 73 | DPRINTF("not enough space in outbuffer, str: %s", out); 74 | bufsize += PINYIN_BUF_SIZE; 75 | char *newbuf = (char *)realloc(out, bufsize); 76 | 77 | if (!newbuf) { 78 | DPRINTF("Out of memory"); 79 | goto error; 80 | } 81 | 82 | int bytes_written = out_ptr - out; 83 | out = newbuf; 84 | out_ptr = out + bytes_written; 85 | out_nleft += PINYIN_BUF_SIZE; 86 | DPRINTF("new bufsize %d, bytes written %d, str: %s", bufsize, 87 | bytes_written, out); 88 | goto redo; 89 | } 90 | 91 | prevcp = cp[i]; 92 | out_ptr += ret; 93 | DPRINTF("bytes written %ld, str: %s", out_ptr - out, out); 94 | out_nleft -= ret; 95 | } 96 | free(cp); 97 | return out; 98 | 99 | error: 100 | free(cp); 101 | free(out); 102 | return NULL; 103 | } 104 | -------------------------------------------------------------------------------- /c/hanzi.h: -------------------------------------------------------------------------------- 1 | #ifndef _HANZI_H 2 | #define _HANZI_H 3 | 4 | #include "unicode.h" 5 | #include 6 | 7 | /* Only block1 in Unihan is covered. */ 8 | static const uint32_t HANZI_START_CODEPOINT = 0x4E00; 9 | static const uint32_t HANZI_END_CODEPOINT = 0x9FFF; 10 | 11 | static inline int hz_is_hanzi(uint32_t cp) { 12 | return (HANZI_START_CODEPOINT <= cp && cp <= HANZI_END_CODEPOINT); 13 | } 14 | 15 | /* XXX In the implementation of hanzi_sqlite.c, the memory returned is managed 16 | * by sqlite3. Calling this function again will destroy previously returned 17 | * data. 18 | * The implementation in hanzi_bindata.c does not have this problem. */ 19 | const char *hz2pinyin_codepoint(uint32_t cp); 20 | 21 | /* Return pinyin of the give hanzi string. Pinyin for each character is 22 | separated by the given sep. Memory is allocated as needed. 23 | 24 | Note you can only mix 8bit ASCII character and Chinese character in the 25 | hanzi string. One separator will be inserted between ASCII and pinyin unless 26 | the ASCII character is space. 
27 | 28 | On error, return NULL. 29 | 30 | Caller should free the memory. */ 31 | char *hz2pinyin(const char *hanzi, const char *sep); 32 | 33 | #endif /* _HANZI_H */ 34 | -------------------------------------------------------------------------------- /c/test/Makefile: -------------------------------------------------------------------------------- 1 | CC = llvm-gcc 2 | LDFLAGS = -liconv 3 | CFLAGS = -g -std=gnu99 -I.. 4 | VPATH = .. 5 | 6 | TEST_RUN = sqlite-test bindata-test 7 | 8 | all: $(TEST_RUN) 9 | 10 | sqlite-test: test.o hanzi.o hanzi-sqlite.o unicode.o 11 | $(CC) $(CFLAGS) $(LDFLAGS) -lsqlite3 $^ -o $@ 12 | 13 | bindata-test: test.o hanzi.o hanzi-bindata.o unicode.o 14 | $(CC) $(CFLAGS) $(LDFLAGS) $^ -o $@ 15 | 16 | clean: 17 | -rm -f *.o 18 | -rm -f $(TEST_RUN) 19 | -------------------------------------------------------------------------------- /c/test/test.c: -------------------------------------------------------------------------------- 1 | #include "unicode.h" 2 | #include "hanzi.h" 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | // #define DEBUG 9 | #include "util.h" 10 | 11 | static const int LINE_LEN_MAX = 255; 12 | 13 | typedef void (*test_func)(const char *, const char *, Encoding); 14 | 15 | static void test_hz2pinyin_codepoint(const char *s, const char *expected_pinyin, 16 | Encoding enc) { 17 | size_t n; 18 | UTF32 *cp; 19 | 20 | if (enc == ENCODING_UTF8) 21 | cp = hz_utf8_to_utf32(s, &n); 22 | else { 23 | printf("Not supported encoding\n"); 24 | return; 25 | } 26 | 27 | if (!cp) { 28 | printf("conversion to codepoint failed\n"); 29 | return; 30 | } 31 | DPRINTF("hanzi contains %lu characters", n); 32 | 33 | char pinyin[255] = { 0 }; 34 | char *pinyin_start = pinyin; 35 | 36 | DPRINTF("str: %s ", s); 37 | for (int i = 0; i < n; i++) { 38 | const char *py = hz2pinyin_codepoint(cp[i]); 39 | if (!py) { 40 | printf("No pinyin found for: %s cp: %x\n", s, cp[i]); 41 | return; 42 | } 43 | int len = strlen(py); 44 | 
strncpy(pinyin_start, py, len); 45 | pinyin_start += len; 46 | 47 | DPRINTF("%x %s ", cp[i], py); 48 | } 49 | 50 | if (strcmp(pinyin, expected_pinyin) != 0) { 51 | printf("ERROR, str: %s, expected_pinyin: %s, got: %s END\n", 52 | s, expected_pinyin, pinyin); 53 | } 54 | free(cp); 55 | } 56 | 57 | static void test_hz2pinyin_string(const char *s, const char *expected_pinyin, 58 | Encoding enc) { 59 | char *pinyin = hz2pinyin(s, " ", enc); 60 | if (!pinyin) { 61 | printf("ERROR converting hanzi %s to string\n", s); 62 | exit(1); 63 | } 64 | if (strcmp(pinyin, expected_pinyin) != 0) { 65 | printf("ERROR, str: %s, expected_pinyin: %s, got: %sEND\n", 66 | s, expected_pinyin, pinyin); 67 | } 68 | free(pinyin); 69 | } 70 | 71 | static void test(const char *path, Encoding enc, test_func func) { 72 | FILE *fp = fopen(path, "r"); 73 | if (!fp) { 74 | fprintf(stderr, "error opening file\n"); 75 | return; 76 | } 77 | 78 | while (1) { 79 | char c[LINE_LEN_MAX]; 80 | char pinyin[LINE_LEN_MAX]; 81 | 82 | fgets(c, LINE_LEN_MAX, fp); 83 | if (feof(fp)) 84 | break; 85 | fgets(pinyin, LINE_LEN_MAX, fp); 86 | 87 | /* Remove trailing "\n" 88 | * XXX strlen does not work with UTF-16 encoded byte sequence */ 89 | c[strlen(c) - 1] = '\0'; 90 | pinyin[strlen(pinyin) - 1] = '\0'; 91 | 92 | func(c, pinyin, enc); 93 | } 94 | fclose(fp); 95 | } 96 | 97 | int main(int argc, char* argv[]) 98 | { 99 | test("./utf8-test.txt", ENCODING_UTF8, test_hz2pinyin_codepoint); 100 | test("./utf8-test-mixed.txt", ENCODING_UTF8, test_hz2pinyin_string); 101 | 102 | /* 0x4E06 has no pinyin */ 103 | const char *pinyin = hz2pinyin_codepoint(0x4E06); 104 | assert(!pinyin); 105 | return 0; 106 | } 107 | -------------------------------------------------------------------------------- /c/test/utf8-test-mixed.txt: -------------------------------------------------------------------------------- 1 | 汉字拼音 2 | han zi pin yin 3 | 你 4 | ni 5 | 苹果 6 | ping guo 7 | 中英文mixed测试 8 | zhong ying wen mixed ce shi 9 | 中英文 mixed测试 10 
| zhong ying wen mixed ce shi 11 | 中英文mixed 测试 12 | zhong ying wen mixed ce shi 13 | begin中英文mixed 测试 14 | begin zhong ying wen mixed ce shi 15 | -------------------------------------------------------------------------------- /c/test/utf8-test.txt: -------------------------------------------------------------------------------- 1 | 汉字拼音 2 | hanzipinyin 3 | 你 4 | ni 5 | 苹果 6 | pingguo 7 | -------------------------------------------------------------------------------- /c/unicode.c: -------------------------------------------------------------------------------- 1 | #include "unicode.h" 2 | #include "util.h" 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | static iconv_t utf8_cd; 10 | 11 | static int init_cd() { 12 | CALL_ONCE(0); 13 | 14 | /* XXX Note the endianness. On intel based Mac, use little endian. */ 15 | utf8_cd = iconv_open("UTF-32LE", "UTF-8"); 16 | if (utf8_cd == (iconv_t)-1) { 17 | perror("iconv_open"); 18 | return 1; 19 | } 20 | return 0; 21 | } 22 | 23 | static UTF32 *convert(iconv_t cd, const char *s, size_t inbytes, size_t *nchar) { 24 | if (cd == (iconv_t) -1) 25 | return NULL; 26 | 27 | /* The upper bound of output memory required */ 28 | size_t outbytes = inbytes * sizeof(UTF32); 29 | char *outbuf = calloc(1, outbytes); 30 | char *outp = outbuf; 31 | 32 | /* Note iconv's return value is "number of characters converted in a non-reversible way". 33 | If we convert UTF-8 encoded string "abc汉字" to ASCII, as the last 2 characters can't 34 | be represented in ASCII, we may convert them to '?', this means we can't convert the 35 | resulting ASCII string back to the original UTF-8 string. So this conversion is 36 | non-reversible. 
*/ 37 | size_t n = iconv(cd, (char **)&s, &inbytes, &outp, &outbytes); 38 | if (n == (size_t) -1) { 39 | perror("convert"); 40 | free(outbuf); 41 | return NULL; 42 | } 43 | if (nchar) 44 | *nchar = (outp - outbuf) / sizeof(UTF32); 45 | return (UTF32 *)outbuf; 46 | } 47 | 48 | UTF32 *hz_utf8_to_utf32(const char *s, size_t *nchar) { 49 | init_cd(); 50 | return convert(utf8_cd, s, strlen(s), nchar); 51 | } 52 | -------------------------------------------------------------------------------- /c/unicode.h: -------------------------------------------------------------------------------- 1 | #ifndef _UNICODE_H 2 | #define _UNICODE_H 3 | 4 | #include 5 | #include 6 | 7 | /* I'm not going to support UTF16. It's awful to handle UTF16 strings using 8 | native C string function. */ 9 | 10 | typedef uint8_t UTF8; 11 | typedef uint32_t UTF32; 12 | 13 | /* Convert UTF-8 string UTF-32. nchar will be set to the number of characters. 14 | * Caller should free memory. */ 15 | UTF32 *hz_utf8_to_utf32(const char *s, size_t *nchar); 16 | 17 | #endif /* _UNICODE_H */ 18 | -------------------------------------------------------------------------------- /c/util.h: -------------------------------------------------------------------------------- 1 | #ifndef _UTIL_H 2 | #define _UTIL_H 3 | 4 | #define CALL_ONCE(retcode) \ 5 | static bool __called = false; \ 6 | if (__called) \ 7 | return 0; \ 8 | else \ 9 | __called = true; \ 10 | 11 | #ifdef DEBUG 12 | # define DPRINTF(fmt, ...) \ 13 | fprintf(stderr, "%s:%d: %s: " fmt "\n", __FILE__, __LINE__, __FUNCTION__, ##__VA_ARGS__) 14 | #else 15 | # define DPRINTF(fmt, ...) 16 | #endif 17 | 18 | #endif -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | Refer to the Unicode standard chapter 12 for more information about the Unified 2 | CJK Ideographs. Table 12-2 has the codepoint range for different blocks. 
3 | 4 | Notes for the codepoint to pinyin map files 5 | 6 | - Unihan/unihan-codepoint2pinyin.txt 7 | - The data is from the Unihan database 8 | ftp://ftp.unicode.org/Public/UNIDATA/Unihan.zip, Unicode version 6.0.0 9 | - Created the pinyin mapping file from Unihan\_Readings.txt 10 | - More details in Unihan/process-unihan-readings.rb 11 | 12 | At first I tried to use data from other sources, but that data has some 13 | problems. The experience is documented here; the data itself is not included. 14 | 15 | - fcitx 16 | - Data file ``gbkpy.org`` in fcitx 17 | - Covers the GBK character set; only one character is not in the 1st block 18 | - Problem: some characters have weird pinyin (possibly dialect readings) 19 | - chinese\_pinyin from https://github.com/flyerhzm/chinese\_pinyin 20 | - Data file ``Mandarin.dat`` 21 | - Contains the characters in the 1st block, extension A and B 22 | - Problem: the 1st block characters are not completely covered. Some 23 | characters covered in fcitx's data file are not present in this one. 
#!/usr/bin/env ruby

# Extract all the possible pinyin from ./block1-codepoint2pinyin.txt and print
# a few statistics plus the sorted table of distinct pinyin.
#
# Each input line is expected to be "<hex codepoint> <pinyin> [<pinyin> ...]".
# Only the first pinyin of a line goes into the table.

py_tbl = []   # distinct first pinyin, in order of first appearance
py_seen = {}  # first pinyin => 1-based position in py_tbl

# The trailing numbers are the values observed on a previous run.
multipy_cnt = 0 # 3651
# characters with less than or equal to 4 bytes pinyin
le4py_cnt = 0 # 15736
total_cnt = 0 # 20253
longest_pinyin = 0

File.open('./block1-codepoint2pinyin.txt') do |f|
  f.each_line do |line|
    _cp, *py = line.split(' ')
    # Blank lines and codepoints without any reading carry no pinyin.
    next if py.empty?

    first = py[0]
    longest_pinyin = first.size if first.size > longest_pinyin

    multipy_cnt += 1 if py.size > 1
    # only consider the first pinyin
    le4py_cnt += 1 if py.size == 1 and first.size <= 4
    total_cnt += 1

    # Key on the pinyin that is actually stored, so the table cannot pick up
    # the same syllable twice when the lists of readings differ.
    unless py_seen[first]
      py_tbl << first
      py_seen[first] = py_tbl.size
    end
  end
end

puts "Number of multi-pinyin characters: #{multipy_cnt}"
puts "Number of characters with less than or equal to 4 bytes pinyin: #{le4py_cnt}"
puts "Longest pinyin: #{longest_pinyin}"
puts "Total number of characters: #{total_cnt}"
puts "Total number of pinyin: #{py_tbl.size}"
p py_tbl.sort
"chuang", "chui", "chun", "chuo", "ci", "cong", 8 | "cou", "cu", "cuan", "cui", "cun", "cuo", "da", "dai", "dan", "dang", "dao", 9 | "de", "den", "deng", "di", "dia", "dian", "diao", "die", "ding", "diu", 10 | "dong", "dou", "du", "duan", "dui", "dun", "duo", "e", "ei", "en", "eng", 11 | "er", "fa", "fan", "fang", "fei", "fen", "feng", "fiao", "fo", "fou", "fu", 12 | "ga", "gai", "gan", "gang", "gao", "ge", "gei", "gen", "geng", "gong", "gou", 13 | "gu", "gua", "guai", "guan", "guang", "gui", "gun", "guo", "ha", "hai", "han", 14 | "hang", "hao", "he", "hei", "hen", "heng", "hm", "hong", "hou", "hu", "hua", 15 | "huai", "huan", "huang", "hui", "hun", "huo", "ji", "jia", "jian", "jiang", 16 | "jiao", "jie", "jin", "jing", "jiong", "jiu", "ju", "juan", "jue", "jun", 17 | "ka", "kai", "kan", "kang", "kao", "ke", "ken", "keng", "kong", "kou", "ku", 18 | "kua", "kuai", "kuan", "kuang", "kui", "kun", "kuo", "la", "lai", "lan", 19 | "lang", "lao", "le", "lei", "leng", "li", "lia", "lian", "liang", "liao", 20 | "lie", "lin", "ling", "liu", "long", "lou", "lu", "luan", "lun", "luo", "lv", 21 | "lve", "m", "ma", "mai", "man", "mang", "mao", "me", "mei", "men", "meng", 22 | "mi", "mian", "miao", "mie", "min", "ming", "miu", "mo", "mou", "mu", "n", 23 | "na", "nai", "nan", "nang", "nao", "ne", "nei", "nen", "neng", "ni", "nian", 24 | "niang", "niao", "nie", "nin", "ning", "niu", "nong", "nou", "nu", "nuan", 25 | "nun", "nuo", "nv", "nve", "o", "ou", "pa", "pai", "pan", "pang", "pao", 26 | "pei", "pen", "peng", "pi", "pian", "piao", "pie", "pin", "ping", "po", "pou", 27 | "pu", "qi", "qia", "qian", "qiang", "qiao", "qie", "qin", "qing", "qiong", 28 | "qiu", "qu", "quan", "que", "qun", "r", "ran", "rang", "rao", "re", "ren", 29 | "reng", "ri", "rong", "rou", "ru", "rua", "ruan", "rui", "run", "ruo", "sa", 30 | "sai", "san", "sang", "sao", "se", "sen", "seng", "sha", "shai", "shan", 31 | "shang", "shao", "she", "shen", "sheng", "shi", "shou", "shu", "shua", 32 | "shuai", "shuan", 
# Append +i+ to +out+ as one 16-bit integer.
#
# out:: any object responding to +write+ (the data file being generated).
# i::   the integer to store.
#
# Returns whatever +out.write+ returns.
def output_int_binary(out, i)
  # "s" packs a 16-bit integer in native byte order. Use "n" (16-bit
  # unsigned, network byte order) if a big-endian file is ever needed.
  encoded = [i].pack("s")
  out.write(encoded)
end
#!/usr/bin/env ruby

# Copies the entries for code points U+4E00..U+9FFF from
# unihan-codepoint2pinyin.txt into block1-codepoint2pinyin.txt, keeping only
# the first pinyin of each entry. Output lines have the form "<HEX> <pinyin>".

# NOTE(review): the input is read as UTF-8 explicitly. The original comment
# below reports that String#split misbehaved on some lines, which looks like a
# default-encoding problem -- confirm before simplifying the parsing.
File.open('./unihan-codepoint2pinyin.txt', 'r:UTF-8') do |f|
  File.open('./block1-codepoint2pinyin.txt', 'w') do |out|
    f.each_line do |line|
      # Calling split directly on the line caused problems for some lines.
      # Maybe split can't work correctly on some unicode character?
      # eg. U+5009,A,B,C,D
      # So skip the leading character column by hand and split the rest.
      id = line.index(' ')
      next if id.nil? # blank or malformed line: nothing to extract

      arr = line[id, line.size - id].split(' ')
      # An entry needs both a code point and at least one pinyin. Writing a
      # line without pinyin would make create-bindata.rb fail later.
      next if arr.size < 2

      cp = arr[0].to_i(16)
      if cp > 0x9FFF
        # Entries are expected in ascending code point order, so nothing
        # after this point can belong to the block.
        break
      elsif 0x4E00 <= cp and cp <= 0x9FFF
        # Only take the first pinyin, do not handle multi location pinyin now.
        out.puts("#{arr[0]} #{arr[1]}")
      end
    end
  end
end
#!/bin/bash
#
# Regenerates pinyin.dat by running the three processing scripts in order.

# Stop at the first failing step. Without this the script kept going and
# reported success even when an earlier stage had crashed.
set -e

# The Ruby scripts open their input and output files relative to the current
# directory, so always run from the directory that contains this script.
cd "$(dirname "$0")"

echo "Processing Unihan_Readings.txt"
./process-unihan-readings.rb
echo "Extracting block1 characters data"
./extract-block1.rb
echo "Creating binary data"
./create-bindata.rb
echo "Binary data pinyin.dat generated"
py[0, py.size - 1] : py 32 | end 33 | 34 | def remove_tonechar(py) 35 | @@tone2char_tbl.each { |tone, c| py.gsub!(tone, c) } 36 | # Check that if there's any tone character not removed 37 | # In this way I found some tone characters I do not know before 38 | # Manually editted U+6b38 and U+8a92 in the final output 39 | py.each_char do |c| 40 | puts "Tone character at #{@cp.to_s(16)} #{c}" if (not ('a' <= c and c <= 'z')) and c != ',' 41 | end 42 | py 43 | end 44 | 45 | def extract_HanyuPinlu(py) 46 | # sang4(22) --> sang 47 | remove_tonechar(remove_tone py.sub(/\(\d*\)/, '')) 48 | end 49 | 50 | def extract_HanyuPinyin(py) 51 | # 10028.020:gǔn,zhu --> [gun, zhu] 52 | py = py.sub(/^[^:]*:/, '') 53 | py = remove_tonechar py 54 | py_arr = py.split(',').map &method(:remove_tone) 55 | end 56 | 57 | def extract_XHC1983(py) 58 | # 10028.020:gǔn --> gun 59 | py = py.sub(/^[^:]*:/, '') 60 | remove_tone(remove_tonechar(py)) 61 | end 62 | 63 | def extract_Mandarin(py) 64 | # sang4 --> sang 65 | remove_tonechar(remove_tone(py)) 66 | end 67 | 68 | def process_pinyin(py_arr, extract) 69 | py_arr = py_arr.map &method(extract) 70 | py_arr.flatten! 71 | py_arr.uniq! 
72 | py_arr 73 | end 74 | 75 | def select_first_py(py_tbl) 76 | #p py_tbl 77 | py_tbl.each do |py| 78 | return py if py 79 | end 80 | end 81 | 82 | def write_output(out, cp, py_tbl) 83 | return if py_tbl == [] 84 | hz = cp.chr('UTF-8') 85 | py = select_first_py(py_tbl) 86 | 87 | out.puts("#{hz} #{cp.to_s(16).upcase} #{py}") 88 | # clear the tbl for next character's use 89 | py_tbl.clear 90 | end 91 | 92 | def process 93 | File.open("./Unihan_Readings.txt") do |f| 94 | File.open('./unihan-codepoint2pinyin.txt', 'w') do |out| 95 | # store all possible 4 fields pinyin here 96 | py_tbl = [] 97 | prev_cp = nil 98 | @count = 0 99 | 100 | f.each_line do |line| 101 | # skip comment 102 | next if line == "\n" or line[0] == '#' 103 | 104 | arr = line.chomp.split("\t") 105 | cp = arr[0] 106 | cp = cp[2, cp.size - 2].to_i(16) 107 | @cp = cp 108 | 109 | #puts "#{prev_cp.to_s(16)} #{cp.to_s(16)}" if prev_cp 110 | if prev_cp and cp != prev_cp 111 | write_output(out, prev_cp, py_tbl) 112 | @count += 1 113 | #exit if count >= 5 114 | end 115 | 116 | field = arr[1] 117 | pys = arr[2] 118 | if pys 119 | pys = pys.split(' ').map { |py| py.downcase } 120 | else 121 | next 122 | end 123 | 124 | case field 125 | when "kHanyuPinlu" 126 | py_arr = process_pinyin(pys, :extract_HanyuPinlu) 127 | py_tbl[0] = py_arr.join(' ') 128 | when "kXHC1983" 129 | py_arr = process_pinyin(pys, :extract_XHC1983) 130 | py_tbl[1] = py_arr.join(' ') 131 | when "kHanyuPinyin" 132 | py_arr = process_pinyin(pys, :extract_HanyuPinyin) 133 | py_tbl[2] = py_arr.join(' ') 134 | when "kMandarin" 135 | py_arr = process_pinyin(pys, :extract_Mandarin) 136 | py_tbl[3] = py_arr.join(' ') 137 | end 138 | 139 | prev_cp = cp 140 | end 141 | end 142 | end 143 | end 144 | end 145 | 146 | up = UniProcess.new 147 | up.process 148 | 149 | -------------------------------------------------------------------------------- /data/Unihan/test-db.rb: -------------------------------------------------------------------------------- 1 | 
# Checks the generated codepoint2pinyin.db against a few known characters.
class TestDB < MiniTest::Unit::TestCase
  def setup
    @db = SQLite3::Database.new("./codepoint2pinyin.db")
    @query = @db.prepare("select pinyin from codepoint2pinyin where codepoint = ?")
  end

  # Release the prepared statement and the database handle after every test;
  # setup opens fresh ones each time.
  def teardown
    @query.close if @query
    @db.close if @db
  end

  # Return the pinyin stored for the first code point of +c+, or nil when the
  # database has no row for it, so the assertion reports a plain mismatch
  # instead of a NoMethodError.
  def get_pinyin(c)
    @query.execute(c.codepoints.first) do |result|
      row = result.next
      return row && row[0]
    end
  end

  def test_pinyin
    assert_equal "ni", get_pinyin('你')
    assert_equal "chen", get_pinyin('陈')
  end
end
41 | end 42 | end 43 | -------------------------------------------------------------------------------- /objective-c/Hanzi2Pinyin/Hanzi2Pinyin.xcodeproj/project.pbxproj: -------------------------------------------------------------------------------- 1 | // !$*UTF8*$! 2 | { 3 | archiveVersion = 1; 4 | classes = { 5 | }; 6 | objectVersion = 46; 7 | objects = { 8 | 9 | /* Begin PBXBuildFile section */ 10 | FA06E98A154FC40E00BD92C4 /* Hanzi2Pinyin.h in Headers */ = {isa = PBXBuildFile; fileRef = FAD5FC4B144019540057FC2C /* Hanzi2Pinyin.h */; settings = {ATTRIBUTES = (Public, ); }; }; 11 | FA39C4BA1553B22A0059A4B4 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = FAD5FC43144019540057FC2C /* Foundation.framework */; }; 12 | FA5AB46E1550FC0400653BCE /* pinyin.dat in Resources */ = {isa = PBXBuildFile; fileRef = FAE3CA97154FAB5900F563C2 /* pinyin.dat */; }; 13 | FAD5FC49144019540057FC2C /* InfoPlist.strings in Resources */ = {isa = PBXBuildFile; fileRef = FAD5FC47144019540057FC2C /* InfoPlist.strings */; }; 14 | FAD5FC4D144019540057FC2C /* Hanzi2Pinyin.m in Sources */ = {isa = PBXBuildFile; fileRef = FAD5FC4C144019540057FC2C /* Hanzi2Pinyin.m */; }; 15 | FAD5FC55144019540057FC2C /* SenTestingKit.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = FAD5FC54144019540057FC2C /* SenTestingKit.framework */; }; 16 | FAD5FC59144019540057FC2C /* Hanzi2Pinyin.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = FAD5FC3B144019540057FC2C /* Hanzi2Pinyin.framework */; }; 17 | FAD5FC5F144019540057FC2C /* InfoPlist.strings in Resources */ = {isa = PBXBuildFile; fileRef = FAD5FC5D144019540057FC2C /* InfoPlist.strings */; }; 18 | FAD5FC61144019540057FC2C /* Hanzi2PinyinTests.h in Resources */ = {isa = PBXBuildFile; fileRef = FAD5FC60144019540057FC2C /* Hanzi2PinyinTests.h */; }; 19 | FAD5FC63144019540057FC2C /* Hanzi2PinyinTests.m in Sources */ = {isa = PBXBuildFile; fileRef = FAD5FC62144019540057FC2C /* Hanzi2PinyinTests.m */; }; 20 | /* End 
PBXBuildFile section */ 21 | 22 | /* Begin PBXContainerItemProxy section */ 23 | FAD5FC57144019540057FC2C /* PBXContainerItemProxy */ = { 24 | isa = PBXContainerItemProxy; 25 | containerPortal = FAD5FC31144019540057FC2C /* Project object */; 26 | proxyType = 1; 27 | remoteGlobalIDString = FAD5FC3A144019540057FC2C; 28 | remoteInfo = Hanzi2Pinyin; 29 | }; 30 | /* End PBXContainerItemProxy section */ 31 | 32 | /* Begin PBXFileReference section */ 33 | FAD5FC3B144019540057FC2C /* Hanzi2Pinyin.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; includeInIndex = 0; path = Hanzi2Pinyin.framework; sourceTree = BUILT_PRODUCTS_DIR; }; 34 | FAD5FC43144019540057FC2C /* Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = System/Library/Frameworks/Foundation.framework; sourceTree = SDKROOT; }; 35 | FAD5FC46144019540057FC2C /* Hanzi2Pinyin-Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = "Hanzi2Pinyin-Info.plist"; sourceTree = ""; }; 36 | FAD5FC48144019540057FC2C /* en */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = en; path = en.lproj/InfoPlist.strings; sourceTree = ""; }; 37 | FAD5FC4A144019540057FC2C /* Hanzi2Pinyin-Prefix.pch */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "Hanzi2Pinyin-Prefix.pch"; sourceTree = ""; }; 38 | FAD5FC4B144019540057FC2C /* Hanzi2Pinyin.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = Hanzi2Pinyin.h; sourceTree = ""; }; 39 | FAD5FC4C144019540057FC2C /* Hanzi2Pinyin.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = Hanzi2Pinyin.m; sourceTree = ""; }; 40 | FAD5FC53144019540057FC2C /* Hanzi2PinyinTests.octest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = Hanzi2PinyinTests.octest; sourceTree = BUILT_PRODUCTS_DIR; }; 41 | FAD5FC54144019540057FC2C /* 
SenTestingKit.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = SenTestingKit.framework; path = Library/Frameworks/SenTestingKit.framework; sourceTree = DEVELOPER_DIR; }; 42 | FAD5FC5C144019540057FC2C /* Hanzi2PinyinTests-Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = "Hanzi2PinyinTests-Info.plist"; sourceTree = ""; }; 43 | FAD5FC5E144019540057FC2C /* en */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = en; path = en.lproj/InfoPlist.strings; sourceTree = ""; }; 44 | FAD5FC60144019540057FC2C /* Hanzi2PinyinTests.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = Hanzi2PinyinTests.h; sourceTree = ""; }; 45 | FAD5FC62144019540057FC2C /* Hanzi2PinyinTests.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = Hanzi2PinyinTests.m; sourceTree = ""; }; 46 | FAE3CA97154FAB5900F563C2 /* pinyin.dat */ = {isa = PBXFileReference; lastKnownFileType = file; path = pinyin.dat; sourceTree = ""; }; 47 | /* End PBXFileReference section */ 48 | 49 | /* Begin PBXFrameworksBuildPhase section */ 50 | FAD5FC37144019540057FC2C /* Frameworks */ = { 51 | isa = PBXFrameworksBuildPhase; 52 | buildActionMask = 2147483647; 53 | files = ( 54 | FA39C4BA1553B22A0059A4B4 /* Foundation.framework in Frameworks */, 55 | ); 56 | runOnlyForDeploymentPostprocessing = 0; 57 | }; 58 | FAD5FC4F144019540057FC2C /* Frameworks */ = { 59 | isa = PBXFrameworksBuildPhase; 60 | buildActionMask = 2147483647; 61 | files = ( 62 | FAD5FC55144019540057FC2C /* SenTestingKit.framework in Frameworks */, 63 | FAD5FC59144019540057FC2C /* Hanzi2Pinyin.framework in Frameworks */, 64 | ); 65 | runOnlyForDeploymentPostprocessing = 0; 66 | }; 67 | /* End PBXFrameworksBuildPhase section */ 68 | 69 | /* Begin PBXGroup section */ 70 | FAD5FC2F144019540057FC2C = { 71 | isa = PBXGroup; 72 | children = ( 73 | FAD5FC44144019540057FC2C /* Hanzi2Pinyin */, 74 | FAD5FC5A144019540057FC2C /* 
Hanzi2PinyinTests */, 75 | FAD5FC3D144019540057FC2C /* Frameworks */, 76 | FAD5FC3C144019540057FC2C /* Products */, 77 | ); 78 | sourceTree = ""; 79 | }; 80 | FAD5FC3C144019540057FC2C /* Products */ = { 81 | isa = PBXGroup; 82 | children = ( 83 | FAD5FC3B144019540057FC2C /* Hanzi2Pinyin.framework */, 84 | FAD5FC53144019540057FC2C /* Hanzi2PinyinTests.octest */, 85 | ); 86 | name = Products; 87 | sourceTree = ""; 88 | }; 89 | FAD5FC3D144019540057FC2C /* Frameworks */ = { 90 | isa = PBXGroup; 91 | children = ( 92 | FAD5FC43144019540057FC2C /* Foundation.framework */, 93 | FAD5FC54144019540057FC2C /* SenTestingKit.framework */, 94 | ); 95 | name = Frameworks; 96 | sourceTree = ""; 97 | }; 98 | FAD5FC44144019540057FC2C /* Hanzi2Pinyin */ = { 99 | isa = PBXGroup; 100 | children = ( 101 | FAE3CA97154FAB5900F563C2 /* pinyin.dat */, 102 | FAD5FC4B144019540057FC2C /* Hanzi2Pinyin.h */, 103 | FAD5FC4C144019540057FC2C /* Hanzi2Pinyin.m */, 104 | FAD5FC45144019540057FC2C /* Supporting Files */, 105 | ); 106 | path = Hanzi2Pinyin; 107 | sourceTree = ""; 108 | }; 109 | FAD5FC45144019540057FC2C /* Supporting Files */ = { 110 | isa = PBXGroup; 111 | children = ( 112 | FAD5FC46144019540057FC2C /* Hanzi2Pinyin-Info.plist */, 113 | FAD5FC47144019540057FC2C /* InfoPlist.strings */, 114 | FAD5FC4A144019540057FC2C /* Hanzi2Pinyin-Prefix.pch */, 115 | ); 116 | name = "Supporting Files"; 117 | sourceTree = ""; 118 | }; 119 | FAD5FC5A144019540057FC2C /* Hanzi2PinyinTests */ = { 120 | isa = PBXGroup; 121 | children = ( 122 | FAD5FC60144019540057FC2C /* Hanzi2PinyinTests.h */, 123 | FAD5FC62144019540057FC2C /* Hanzi2PinyinTests.m */, 124 | FAD5FC5B144019540057FC2C /* Supporting Files */, 125 | ); 126 | path = Hanzi2PinyinTests; 127 | sourceTree = ""; 128 | }; 129 | FAD5FC5B144019540057FC2C /* Supporting Files */ = { 130 | isa = PBXGroup; 131 | children = ( 132 | FAD5FC5C144019540057FC2C /* Hanzi2PinyinTests-Info.plist */, 133 | FAD5FC5D144019540057FC2C /* InfoPlist.strings */, 134 | ); 135 | 
name = "Supporting Files"; 136 | sourceTree = ""; 137 | }; 138 | /* End PBXGroup section */ 139 | 140 | /* Begin PBXHeadersBuildPhase section */ 141 | FAD5FC38144019540057FC2C /* Headers */ = { 142 | isa = PBXHeadersBuildPhase; 143 | buildActionMask = 2147483647; 144 | files = ( 145 | FA06E98A154FC40E00BD92C4 /* Hanzi2Pinyin.h in Headers */, 146 | ); 147 | runOnlyForDeploymentPostprocessing = 0; 148 | }; 149 | /* End PBXHeadersBuildPhase section */ 150 | 151 | /* Begin PBXNativeTarget section */ 152 | FAD5FC3A144019540057FC2C /* Hanzi2Pinyin */ = { 153 | isa = PBXNativeTarget; 154 | buildConfigurationList = FAD5FC66144019540057FC2C /* Build configuration list for PBXNativeTarget "Hanzi2Pinyin" */; 155 | buildPhases = ( 156 | FAD5FC36144019540057FC2C /* Sources */, 157 | FAD5FC37144019540057FC2C /* Frameworks */, 158 | FAD5FC38144019540057FC2C /* Headers */, 159 | FAD5FC39144019540057FC2C /* Resources */, 160 | ); 161 | buildRules = ( 162 | ); 163 | dependencies = ( 164 | ); 165 | name = Hanzi2Pinyin; 166 | productName = Hanzi2Pinyin; 167 | productReference = FAD5FC3B144019540057FC2C /* Hanzi2Pinyin.framework */; 168 | productType = "com.apple.product-type.framework"; 169 | }; 170 | FAD5FC52144019540057FC2C /* Hanzi2PinyinTests */ = { 171 | isa = PBXNativeTarget; 172 | buildConfigurationList = FAD5FC69144019540057FC2C /* Build configuration list for PBXNativeTarget "Hanzi2PinyinTests" */; 173 | buildPhases = ( 174 | FAD5FC4E144019540057FC2C /* Sources */, 175 | FAD5FC4F144019540057FC2C /* Frameworks */, 176 | FAD5FC50144019540057FC2C /* Resources */, 177 | FAD5FC51144019540057FC2C /* ShellScript */, 178 | ); 179 | buildRules = ( 180 | ); 181 | dependencies = ( 182 | FAD5FC58144019540057FC2C /* PBXTargetDependency */, 183 | ); 184 | name = Hanzi2PinyinTests; 185 | productName = Hanzi2PinyinTests; 186 | productReference = FAD5FC53144019540057FC2C /* Hanzi2PinyinTests.octest */; 187 | productType = "com.apple.product-type.bundle"; 188 | }; 189 | /* End PBXNativeTarget 
section */ 190 | 191 | /* Begin PBXProject section */ 192 | FAD5FC31144019540057FC2C /* Project object */ = { 193 | isa = PBXProject; 194 | attributes = { 195 | LastUpgradeCheck = 0430; 196 | }; 197 | buildConfigurationList = FAD5FC34144019540057FC2C /* Build configuration list for PBXProject "Hanzi2Pinyin" */; 198 | compatibilityVersion = "Xcode 3.2"; 199 | developmentRegion = English; 200 | hasScannedForEncodings = 0; 201 | knownRegions = ( 202 | en, 203 | ); 204 | mainGroup = FAD5FC2F144019540057FC2C; 205 | productRefGroup = FAD5FC3C144019540057FC2C /* Products */; 206 | projectDirPath = ""; 207 | projectRoot = ""; 208 | targets = ( 209 | FAD5FC3A144019540057FC2C /* Hanzi2Pinyin */, 210 | FAD5FC52144019540057FC2C /* Hanzi2PinyinTests */, 211 | ); 212 | }; 213 | /* End PBXProject section */ 214 | 215 | /* Begin PBXResourcesBuildPhase section */ 216 | FAD5FC39144019540057FC2C /* Resources */ = { 217 | isa = PBXResourcesBuildPhase; 218 | buildActionMask = 2147483647; 219 | files = ( 220 | FA5AB46E1550FC0400653BCE /* pinyin.dat in Resources */, 221 | FAD5FC49144019540057FC2C /* InfoPlist.strings in Resources */, 222 | ); 223 | runOnlyForDeploymentPostprocessing = 0; 224 | }; 225 | FAD5FC50144019540057FC2C /* Resources */ = { 226 | isa = PBXResourcesBuildPhase; 227 | buildActionMask = 2147483647; 228 | files = ( 229 | FAD5FC5F144019540057FC2C /* InfoPlist.strings in Resources */, 230 | FAD5FC61144019540057FC2C /* Hanzi2PinyinTests.h in Resources */, 231 | ); 232 | runOnlyForDeploymentPostprocessing = 0; 233 | }; 234 | /* End PBXResourcesBuildPhase section */ 235 | 236 | /* Begin PBXShellScriptBuildPhase section */ 237 | FAD5FC51144019540057FC2C /* ShellScript */ = { 238 | isa = PBXShellScriptBuildPhase; 239 | buildActionMask = 2147483647; 240 | files = ( 241 | ); 242 | inputPaths = ( 243 | ); 244 | outputPaths = ( 245 | ); 246 | runOnlyForDeploymentPostprocessing = 0; 247 | shellPath = /bin/sh; 248 | shellScript = "# Run the unit tests in this test 
bundle.\n\"${SYSTEM_DEVELOPER_DIR}/Tools/RunUnitTests\"\n"; 249 | }; 250 | /* End PBXShellScriptBuildPhase section */ 251 | 252 | /* Begin PBXSourcesBuildPhase section */ 253 | FAD5FC36144019540057FC2C /* Sources */ = { 254 | isa = PBXSourcesBuildPhase; 255 | buildActionMask = 2147483647; 256 | files = ( 257 | FAD5FC4D144019540057FC2C /* Hanzi2Pinyin.m in Sources */, 258 | ); 259 | runOnlyForDeploymentPostprocessing = 0; 260 | }; 261 | FAD5FC4E144019540057FC2C /* Sources */ = { 262 | isa = PBXSourcesBuildPhase; 263 | buildActionMask = 2147483647; 264 | files = ( 265 | FAD5FC63144019540057FC2C /* Hanzi2PinyinTests.m in Sources */, 266 | ); 267 | runOnlyForDeploymentPostprocessing = 0; 268 | }; 269 | /* End PBXSourcesBuildPhase section */ 270 | 271 | /* Begin PBXTargetDependency section */ 272 | FAD5FC58144019540057FC2C /* PBXTargetDependency */ = { 273 | isa = PBXTargetDependency; 274 | target = FAD5FC3A144019540057FC2C /* Hanzi2Pinyin */; 275 | targetProxy = FAD5FC57144019540057FC2C /* PBXContainerItemProxy */; 276 | }; 277 | /* End PBXTargetDependency section */ 278 | 279 | /* Begin PBXVariantGroup section */ 280 | FAD5FC47144019540057FC2C /* InfoPlist.strings */ = { 281 | isa = PBXVariantGroup; 282 | children = ( 283 | FAD5FC48144019540057FC2C /* en */, 284 | ); 285 | name = InfoPlist.strings; 286 | sourceTree = ""; 287 | }; 288 | FAD5FC5D144019540057FC2C /* InfoPlist.strings */ = { 289 | isa = PBXVariantGroup; 290 | children = ( 291 | FAD5FC5E144019540057FC2C /* en */, 292 | ); 293 | name = InfoPlist.strings; 294 | sourceTree = ""; 295 | }; 296 | /* End PBXVariantGroup section */ 297 | 298 | /* Begin XCBuildConfiguration section */ 299 | FAD5FC64144019540057FC2C /* Debug */ = { 300 | isa = XCBuildConfiguration; 301 | buildSettings = { 302 | ALWAYS_SEARCH_USER_PATHS = NO; 303 | ARCHS = "$(ARCHS_STANDARD_64_BIT)"; 304 | COPY_PHASE_STRIP = NO; 305 | GCC_C_LANGUAGE_STANDARD = gnu99; 306 | GCC_DYNAMIC_NO_PIC = NO; 307 | GCC_ENABLE_OBJC_EXCEPTIONS = YES; 308 | 
GCC_OPTIMIZATION_LEVEL = 0; 309 | GCC_PREPROCESSOR_DEFINITIONS = ( 310 | "DEBUG=1", 311 | "$(inherited)", 312 | ); 313 | GCC_SYMBOLS_PRIVATE_EXTERN = NO; 314 | GCC_VERSION = com.apple.compilers.llvm.clang.1_0; 315 | GCC_WARN_64_TO_32_BIT_CONVERSION = YES; 316 | GCC_WARN_ABOUT_MISSING_PROTOTYPES = YES; 317 | GCC_WARN_ABOUT_RETURN_TYPE = YES; 318 | GCC_WARN_UNUSED_VARIABLE = YES; 319 | MACOSX_DEPLOYMENT_TARGET = 10.7; 320 | ONLY_ACTIVE_ARCH = YES; 321 | SDKROOT = macosx; 322 | }; 323 | name = Debug; 324 | }; 325 | FAD5FC65144019540057FC2C /* Release */ = { 326 | isa = XCBuildConfiguration; 327 | buildSettings = { 328 | ALWAYS_SEARCH_USER_PATHS = NO; 329 | ARCHS = "$(ARCHS_STANDARD_64_BIT)"; 330 | COPY_PHASE_STRIP = YES; 331 | DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; 332 | GCC_C_LANGUAGE_STANDARD = gnu99; 333 | GCC_ENABLE_OBJC_EXCEPTIONS = YES; 334 | GCC_VERSION = com.apple.compilers.llvm.clang.1_0; 335 | GCC_WARN_64_TO_32_BIT_CONVERSION = YES; 336 | GCC_WARN_ABOUT_MISSING_PROTOTYPES = YES; 337 | GCC_WARN_ABOUT_RETURN_TYPE = YES; 338 | GCC_WARN_UNUSED_VARIABLE = YES; 339 | MACOSX_DEPLOYMENT_TARGET = 10.7; 340 | SDKROOT = macosx; 341 | }; 342 | name = Release; 343 | }; 344 | FAD5FC67144019540057FC2C /* Debug */ = { 345 | isa = XCBuildConfiguration; 346 | buildSettings = { 347 | DYLIB_COMPATIBILITY_VERSION = 1; 348 | DYLIB_CURRENT_VERSION = 1; 349 | FRAMEWORK_VERSION = A; 350 | GCC_PRECOMPILE_PREFIX_HEADER = YES; 351 | GCC_PREFIX_HEADER = "Hanzi2Pinyin/Hanzi2Pinyin-Prefix.pch"; 352 | INFOPLIST_FILE = "Hanzi2Pinyin/Hanzi2Pinyin-Info.plist"; 353 | INSTALL_PATH = "@executable_path/../Frameworks"; 354 | PRODUCT_NAME = "$(TARGET_NAME)"; 355 | WRAPPER_EXTENSION = framework; 356 | }; 357 | name = Debug; 358 | }; 359 | FAD5FC68144019540057FC2C /* Release */ = { 360 | isa = XCBuildConfiguration; 361 | buildSettings = { 362 | DYLIB_COMPATIBILITY_VERSION = 1; 363 | DYLIB_CURRENT_VERSION = 1; 364 | FRAMEWORK_VERSION = A; 365 | GCC_PRECOMPILE_PREFIX_HEADER = YES; 366 | 
GCC_PREFIX_HEADER = "Hanzi2Pinyin/Hanzi2Pinyin-Prefix.pch"; 367 | INFOPLIST_FILE = "Hanzi2Pinyin/Hanzi2Pinyin-Info.plist"; 368 | INSTALL_PATH = "@executable_path/../Frameworks"; 369 | PRODUCT_NAME = "$(TARGET_NAME)"; 370 | WRAPPER_EXTENSION = framework; 371 | }; 372 | name = Release; 373 | }; 374 | FAD5FC6A144019540057FC2C /* Debug */ = { 375 | isa = XCBuildConfiguration; 376 | buildSettings = { 377 | FRAMEWORK_SEARCH_PATHS = "$(DEVELOPER_LIBRARY_DIR)/Frameworks"; 378 | GCC_PRECOMPILE_PREFIX_HEADER = YES; 379 | GCC_PREFIX_HEADER = "Hanzi2Pinyin/Hanzi2Pinyin-Prefix.pch"; 380 | INFOPLIST_FILE = "Hanzi2PinyinTests/Hanzi2PinyinTests-Info.plist"; 381 | PRODUCT_NAME = "$(TARGET_NAME)"; 382 | WRAPPER_EXTENSION = octest; 383 | }; 384 | name = Debug; 385 | }; 386 | FAD5FC6B144019540057FC2C /* Release */ = { 387 | isa = XCBuildConfiguration; 388 | buildSettings = { 389 | FRAMEWORK_SEARCH_PATHS = "$(DEVELOPER_LIBRARY_DIR)/Frameworks"; 390 | GCC_PRECOMPILE_PREFIX_HEADER = YES; 391 | GCC_PREFIX_HEADER = "Hanzi2Pinyin/Hanzi2Pinyin-Prefix.pch"; 392 | INFOPLIST_FILE = "Hanzi2PinyinTests/Hanzi2PinyinTests-Info.plist"; 393 | PRODUCT_NAME = "$(TARGET_NAME)"; 394 | WRAPPER_EXTENSION = octest; 395 | }; 396 | name = Release; 397 | }; 398 | /* End XCBuildConfiguration section */ 399 | 400 | /* Begin XCConfigurationList section */ 401 | FAD5FC34144019540057FC2C /* Build configuration list for PBXProject "Hanzi2Pinyin" */ = { 402 | isa = XCConfigurationList; 403 | buildConfigurations = ( 404 | FAD5FC64144019540057FC2C /* Debug */, 405 | FAD5FC65144019540057FC2C /* Release */, 406 | ); 407 | defaultConfigurationIsVisible = 0; 408 | defaultConfigurationName = Release; 409 | }; 410 | FAD5FC66144019540057FC2C /* Build configuration list for PBXNativeTarget "Hanzi2Pinyin" */ = { 411 | isa = XCConfigurationList; 412 | buildConfigurations = ( 413 | FAD5FC67144019540057FC2C /* Debug */, 414 | FAD5FC68144019540057FC2C /* Release */, 415 | ); 416 | defaultConfigurationIsVisible = 0; 417 | 
defaultConfigurationName = Release; 418 | }; 419 | FAD5FC69144019540057FC2C /* Build configuration list for PBXNativeTarget "Hanzi2PinyinTests" */ = { 420 | isa = XCConfigurationList; 421 | buildConfigurations = ( 422 | FAD5FC6A144019540057FC2C /* Debug */, 423 | FAD5FC6B144019540057FC2C /* Release */, 424 | ); 425 | defaultConfigurationIsVisible = 0; 426 | defaultConfigurationName = Release; 427 | }; 428 | /* End XCConfigurationList section */ 429 | }; 430 | rootObject = FAD5FC31144019540057FC2C /* Project object */; 431 | } 432 | -------------------------------------------------------------------------------- /objective-c/Hanzi2Pinyin/Hanzi2Pinyin.xcodeproj/project.xcworkspace/contents.xcworkspacedata: -------------------------------------------------------------------------------- 1 | 2 | 4 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /objective-c/Hanzi2Pinyin/Hanzi2Pinyin/Hanzi2Pinyin-Info.plist: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | CFBundleDevelopmentRegion 6 | English 7 | CFBundleExecutable 8 | ${EXECUTABLE_NAME} 9 | CFBundleIconFile 10 | 11 | CFBundleIdentifier 12 | info.chenyufei.${PRODUCT_NAME:rfc1034identifier} 13 | CFBundleInfoDictionaryVersion 14 | 6.0 15 | CFBundleName 16 | ${PRODUCT_NAME} 17 | CFBundlePackageType 18 | FMWK 19 | CFBundleShortVersionString 20 | 1.0 21 | CFBundleSignature 22 | ???? 23 | CFBundleVersion 24 | 1 25 | NSHumanReadableCopyright 26 | Copyright © 2011年 __MyCompanyName__. All rights reserved. 
// Class-method-only helper for converting Chinese characters in a string to
// pinyin.
@interface Hanzi2Pinyin : NSObject

// Return a new string with all Chinese characters converted to pinyin.
// For 2 consecutive characters, if one is Chinese, the separator `sep` will
// be added between them if the other is not a space.
// ("separater" is the spelling used by the selector and is part of the API.)
+ (NSString *)convert:(NSString *)str separater:(NSString *)sep;

// Convenience form of convert:separater: without an explicit separator.
// NOTE(review): which separator is used is decided in Hanzi2Pinyin.m --
// confirm there.
+ (NSString *)convert:(NSString *)str;

// Convert each Chinese character to the first letter of its pinyin.
+ (NSString *)convertToAbbreviation:(NSString *)str;

// Presumably returns YES when `str` contains at least one Chinese character;
// verify against the implementation.
+ (BOOL)hasChineseCharacter:(NSString *)str;

@end
// So much of the code in this file is copied from c/hanzi-bindata.c

#import "Hanzi2Pinyin.h"

#import <ctype.h>
#import <fcntl.h>
#import <sys/mman.h>
#import <sys/stat.h>
#import <unistd.h>

/* Obtained from data/Unihan/block1-pinyin-statistic.rb */
static NSString *pinyinTbl[] = {
    @"a", @"ai", @"an", @"ang", @"ao", @"ba", @"bai", @"ban", @"bang", @"bao", @"bei",
    @"ben", @"beng", @"bi", @"bian", @"biao", @"bie", @"bin", @"bing", @"bo", @"bu", @"ca",
    @"cai", @"can", @"cang", @"cao", @"ce", @"cen", @"ceng", @"cha", @"chai", @"chan",
    @"chang", @"chao", @"che", @"chen", @"cheng", @"chi", @"chong", @"chou", @"chu",
    @"chua", @"chuai", @"chuan", @"chuang", @"chui", @"chun", @"chuo", @"ci", @"cong",
    @"cou", @"cu", @"cuan", @"cui", @"cun", @"cuo", @"da", @"dai", @"dan", @"dang", @"dao",
    @"de", @"den", @"deng", @"di", @"dia", @"dian", @"diao", @"die", @"ding", @"diu",
    @"dong", @"dou", @"du", @"duan", @"dui", @"dun", @"duo", @"e", @"ei", @"en", @"eng",
    @"er", @"fa", @"fan", @"fang", @"fei", @"fen", @"feng", @"fiao", @"fo", @"fou", @"fu",
    @"ga", @"gai", @"gan", @"gang", @"gao", @"ge", @"gei", @"gen", @"geng", @"gong",
    @"gou", @"gu", @"gua", @"guai", @"guan", @"guang", @"gui", @"gun", @"guo", @"ha",
    @"hai", @"han", @"hang", @"hao", @"he", @"hei", @"hen", @"heng", @"hm", @"hong",
    @"hou", @"hu", @"hua", @"huai", @"huan", @"huang", @"hui", @"hun", @"huo", @"ji",
    @"jia", @"jian", @"jiang", @"jiao", @"jie", @"jin", @"jing", @"jiong", @"jiu", @"ju",
    @"juan", @"jue", @"jun", @"ka", @"kai", @"kan", @"kang", @"kao", @"ke", @"ken",
    @"keng", @"kong", @"kou", @"ku", @"kua", @"kuai", @"kuan", @"kuang", @"kui", @"kun",
    @"kuo", @"la", @"lai", @"lan", @"lang", @"lao", @"le", @"lei", @"leng", @"li", @"lia",
    @"lian", @"liang", @"liao", @"lie", @"lin", @"ling", @"liu", @"long", @"lou", @"lu",
    @"luan", @"lun", @"luo", @"lv", @"lve", @"m", @"ma", @"mai", @"man", @"mang", @"mao",
    @"me", @"mei", @"men", @"meng", @"mi", @"mian", @"miao", @"mie", @"min", @"ming",
    @"miu", @"mo", @"mou", @"mu", @"n", @"na", @"nai", @"nan", @"nang", @"nao", @"ne",
    @"nei", @"nen", @"neng", @"ni", @"nian", @"niang", @"niao", @"nie", @"nin", @"ning",
    @"niu", @"nong", @"nou", @"nu", @"nuan", @"nun", @"nuo", @"nv", @"nve", @"o", @"ou",
    @"pa", @"pai", @"pan", @"pang", @"pao", @"pei", @"pen", @"peng", @"pi", @"pian",
    @"piao", @"pie", @"pin", @"ping", @"po", @"pou", @"pu", @"qi", @"qia", @"qian",
    @"qiang", @"qiao", @"qie", @"qin", @"qing", @"qiong", @"qiu", @"qu", @"quan", @"que",
    @"qun", @"r", @"ran", @"rang", @"rao", @"re", @"ren", @"reng", @"ri", @"rong", @"rou",
    @"ru", @"rua", @"ruan", @"rui", @"run", @"ruo", @"sa", @"sai", @"san", @"sang", @"sao",
    @"se", @"sen", @"seng", @"sha", @"shai", @"shan", @"shang", @"shao", @"she", @"shen",
    @"sheng", @"shi", @"shou", @"shu", @"shua", @"shuai", @"shuan", @"shuang", @"shui",
    @"shun", @"shuo", @"si", @"song", @"sou", @"su", @"suan", @"sui", @"sun", @"suo",
    @"ta", @"tai", @"tan", @"tang", @"tao", @"te", @"teng", @"ti", @"tian", @"tiao",
    @"tie", @"ting", @"tong", @"tou", @"tu", @"tuan", @"tui", @"tun", @"tuo", @"wa",
    @"wai", @"wan", @"wang", @"wei", @"wen", @"weng", @"wo", @"wu", @"xi", @"xia", @"xian",
    @"xiang", @"xiao", @"xie", @"xin", @"xing", @"xiong", @"xiu", @"xu", @"xuan", @"xue",
    @"xun", @"ya", @"yan", @"yang", @"yao", @"ye", @"yi", @"yin", @"ying", @"yo", @"yong",
    @"you", @"yu", @"yuan", @"yue", @"yun", @"za", @"zai", @"zan", @"zang", @"zao", @"ze",
    @"zei", @"zen", @"zeng", @"zha", @"zhai", @"zhan", @"zhang", @"zhao", @"zhe", @"zhen",
    @"zheng", @"zhi", @"zhong", @"zhou", @"zhu", @"zhua", @"zhuai", @"zhuan", @"zhuang",
    @"zhui", @"zhun", @"zhuo", @"zi", @"zong", @"zou", @"zu", @"zuan", @"zui", @"zun",
    @"zuo"
};

/*
 * Read-only mapping of pinyin.dat: one uint16_t per code point, indexed by
 * (code point - HANZI_START_CODEPOINT). Each entry is an index into pinyinTbl,
 * or 0xFFFF when no reading is available. NULL until +initialize succeeds.
 */
static const uint16_t *pinyinData;

/* Only block1 in Unihan is covered. */
static const uint32_t HANZI_START_CODEPOINT = 0x4E00;
static const uint32_t HANZI_END_CODEPOINT = 0x9FFF;

/* YES when cp lies inside the covered range [HANZI_START_CODEPOINT, HANZI_END_CODEPOINT]. */
static inline BOOL isHanzi(uint32_t cp) {
    return (HANZI_START_CODEPOINT <= cp && cp <= HANZI_END_CODEPOINT);
}

/*
 * isspace() is only defined for values representable as unsigned char (or EOF),
 * so code points outside ASCII must never be handed to it.
 */
static inline BOOL isASCIISpace(uint32_t cp) {
    return (cp < 0x80 && isspace((int)cp)) ? YES : NO;
}

/*
 * Decide whether a separator belongs between two consecutive characters:
 * one of them must be a Hanzi and the other must not be whitespace.
 */
static BOOL needSeparater(uint32_t prevcp, uint32_t curcp) {
    if (isHanzi(curcp))
        return !isASCIISpace(prevcp);
    if (isASCIISpace(curcp))
        return NO;
    return isHanzi(prevcp);
}

/*
 * Look up the pinyin of a code point.
 * Returns nil when the data file is not loaded, cp is outside the covered
 * range, or the data file holds no usable reading for cp.
 */
static NSString *pinyinFromCodepoint(uint32_t cp) {
    if (!pinyinData || !isHanzi(cp))
        return nil;

    const size_t tableCount = sizeof(pinyinTbl) / sizeof(pinyinTbl[0]);
    uint16_t index = pinyinData[cp - HANZI_START_CODEPOINT];
    // 0xFFFF marks "no reading"; the range check also rejects any other
    // index that would fall outside pinyinTbl (e.g. a corrupt data file).
    if (index >= tableCount)
        return nil;
    return pinyinTbl[index];
}

@implementation Hanzi2Pinyin

/*
 * Maps pinyin.dat from the bundle containing this class. On any failure a
 * message is logged and pinyinData stays NULL, in which case every
 * conversion passes characters through unchanged.
 */
+ (void)initialize {
    // +initialize is also invoked on behalf of subclasses that do not
    // override it; map the data file only once.
    if (self != [Hanzi2Pinyin class])
        return;

    NSBundle *bundle = [NSBundle bundleForClass:[Hanzi2Pinyin class]];
    NSString *dataPath = [bundle pathForResource:@"pinyin" ofType:@"dat"];
    if (!dataPath) {
        NSLog(@"Can't open pinyin data file");
        return;
    }

    int fd = open([dataPath fileSystemRepresentation], O_RDONLY);
    if (fd < 0) {
        NSLog(@"Can't open pinyin data file");
        return;
    }

    // The lookup indexes up to (END - START) entries, so refuse a file that
    // is too short instead of reading past the end of the mapping later.
    const off_t requiredBytes =
        (off_t)((HANZI_END_CODEPOINT - HANZI_START_CODEPOINT + 1) * sizeof(uint16_t));
    struct stat info;
    if (fstat(fd, &info) != 0 || info.st_size < requiredBytes) {
        NSLog(@"Pinyin data file is missing or truncated");
        close(fd);
        return;
    }

    void *mapping = mmap(NULL, (size_t)info.st_size, PROT_READ, MAP_SHARED, fd, 0);
    // The mapping stays valid after the descriptor is closed.
    close(fd);
    if (mapping == MAP_FAILED) {
        NSLog(@"Can't mmap pinyin data file");
        return;
    }
    pinyinData = (const uint16_t *)mapping;
}

/*
 * Returns a new string in which every covered Chinese character is replaced
 * by its pinyin; all other characters are copied unchanged. sep is inserted
 * between two consecutive characters when one is Chinese and the other is
 * not whitespace. A nil sep is treated as an empty string; a nil str yields
 * an empty string.
 *
 * The string is walked in UTF-16 units. Every covered Hanzi is a single
 * UTF-16 unit and can never be half of a surrogate pair, so characters
 * outside the BMP are copied through intact, half by half.
 */
+ (NSString *)convert:(NSString *)str separater:(NSString *)sep {
    if (!sep)
        sep = @"";

    NSUInteger length = [str length];
    NSMutableString *pinyin = [NSMutableString stringWithCapacity:(length * 4)];

    uint32_t prevcp = ' ';  // pretend the string is preceded by a space: no leading separator
    for (NSUInteger i = 0; i < length; i++) {
        uint32_t curcp = [str characterAtIndex:i];
        NSString *appendStr = pinyinFromCodepoint(curcp);
        if (!appendStr) {
            // No pinyin found, add the original character in the string
            appendStr = [str substringWithRange:NSMakeRange(i, 1)];
        }
        if (needSeparater(prevcp, curcp)) {
            [pinyin appendString:sep];
        }
        [pinyin appendString:appendStr];
        prevcp = curcp;
    }
    return [NSString stringWithString:pinyin];
}

/* Same as convert:separater: with a single space as the separator. */
+ (NSString *)convert:(NSString *)str {
    return [self convert:str separater:@" "];
}

/*
 * Returns a new string in which every covered Chinese character is replaced
 * by the first letter of its pinyin; other characters are copied unchanged
 * and no separator is inserted.
 */
+ (NSString *)convertToAbbreviation:(NSString *)str {
    NSUInteger length = [str length];
    NSMutableString *abbrev = [NSMutableString stringWithCapacity:length];

    for (NSUInteger i = 0; i < length; i++) {
        NSString *py = pinyinFromCodepoint([str characterAtIndex:i]);
        if (py) {
            [abbrev appendString:[py substringToIndex:1]];
        } else {
            // No pinyin found, add the original character in the string
            [abbrev appendString:[str substringWithRange:NSMakeRange(i, 1)]];
        }
    }
    return [NSString stringWithString:abbrev];
}

/* YES when str contains at least one character in the covered Hanzi range. */
+ (BOOL)hasChineseCharacter:(NSString *)str {
    NSUInteger length = [str length];
    for (NSUInteger i = 0; i < length; i++) {
        if (isHanzi([str characterAtIndex:i]))
            return YES;
    }
    return NO;
}

@end
-------------------------------------------------------------------------------- /objective-c/Hanzi2Pinyin/Hanzi2Pinyin/en.lproj/InfoPlist.strings: -------------------------------------------------------------------------------- 1 | /* Localized versions of Info.plist keys */ 2 | 3 | -------------------------------------------------------------------------------- /objective-c/Hanzi2Pinyin/Hanzi2Pinyin/pinyin.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyfdecyf/hanzi2pinyin/e14b0f3ac03c65b7ef872bfc4983e2e3d683c162/objective-c/Hanzi2Pinyin/Hanzi2Pinyin/pinyin.dat -------------------------------------------------------------------------------- /objective-c/Hanzi2Pinyin/Hanzi2PinyinTests/Hanzi2PinyinTests-Info.plist: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | CFBundleDevelopmentRegion 6 | en 7 | CFBundleExecutable 8 | ${EXECUTABLE_NAME} 9 | CFBundleIdentifier 10 | info.chenyufei.${PRODUCT_NAME:rfc1034identifier} 11 | CFBundleInfoDictionaryVersion 12 | 6.0 13 | CFBundlePackageType 14 | BNDL 15 | CFBundleShortVersionString 16 | 1.0 17 | CFBundleSignature 18 | ???? 19 | CFBundleVersion 20 | 1 21 | 22 | 23 | -------------------------------------------------------------------------------- /objective-c/Hanzi2Pinyin/Hanzi2PinyinTests/Hanzi2PinyinTests.h: -------------------------------------------------------------------------------- 1 | // 2 | // Hanzi2PinyinTests.h 3 | // Hanzi2PinyinTests 4 | // 5 | // Created by Chen Yufei on 11-10-8. 6 | // Copyright 2011年 __MyCompanyName__. All rights reserved. 
7 | // 8 | 9 | #import 10 | 11 | @interface Hanzi2PinyinTests : SenTestCase 12 | 13 | @end 14 | -------------------------------------------------------------------------------- /objective-c/Hanzi2Pinyin/Hanzi2PinyinTests/Hanzi2PinyinTests.m: -------------------------------------------------------------------------------- 1 | // 2 | // Hanzi2PinyinTests.m 3 | // Hanzi2PinyinTests 4 | // 5 | // Created by Chen Yufei on 11-10-8. 6 | // Copyright 2011年 __MyCompanyName__. All rights reserved. 7 | // 8 | 9 | #import "Hanzi2PinyinTests.h" 10 | #import "Hanzi2Pinyin.h" 11 | 12 | @implementation Hanzi2PinyinTests 13 | 14 | - (void)setUp 15 | { 16 | [super setUp]; 17 | 18 | // Set-up code here. 19 | } 20 | 21 | - (void)tearDown 22 | { 23 | // Tear-down code here. 24 | 25 | [super tearDown]; 26 | } 27 | 28 | - (void)testConvert 29 | { 30 | NSString *py = [Hanzi2Pinyin convert:@"abc,love"]; 31 | STAssertTrue([py isEqualToString:@"abc,love"], @"ASCII char should not convert"); 32 | 33 | py = [Hanzi2Pinyin convert:@"你好"]; 34 | STAssertTrue([py isEqualToString:@"ni hao"], @"pinyin not correct, got %@", py); 35 | 36 | py = [Hanzi2Pinyin convert:@"欢迎 world"]; 37 | STAssertTrue([py isEqualToString:@"huan ying world"], @"pinyin not correct, got %@", py); 38 | 39 | py = [Hanzi2Pinyin convert:@"I欢迎world"]; 40 | STAssertTrue([py isEqualToString:@"I huan ying world"], @"pinyin not correct, got %@", py); 41 | 42 | py = [Hanzi2Pinyin convert:@"Steve Jobs,祝福"]; 43 | STAssertTrue([py isEqualToString:@"Steve Jobs, zhu fu"], @"pinyin not correct, got %@", py); 44 | 45 | py = [Hanzi2Pinyin convert:@"沈阳"]; 46 | STAssertTrue([py isEqualToString:@"shen yang"], @"pinyin not correct, got %@", py); 47 | } 48 | 49 | - (void)testConvertAbbreviation 50 | { 51 | NSString *py = [Hanzi2Pinyin convertToAbbreviation:@"abc,love"]; 52 | STAssertTrue([py isEqualToString:@"abc,love"], @"ASCII char should not convert"); 53 | 54 | py = [Hanzi2Pinyin convertToAbbreviation:@"你好"]; 55 | STAssertTrue([py 
isEqualToString:@"nh"], @"pinyin not correct, got %@", py); 56 | 57 | py = [Hanzi2Pinyin convertToAbbreviation:@"欢迎 world"]; 58 | STAssertTrue([py isEqualToString:@"hy world"], @"pinyin not correct, got %@", py); 59 | 60 | py = [Hanzi2Pinyin convertToAbbreviation:@"沈阳"]; 61 | STAssertTrue([py isEqualToString:@"sy"], @"pinyin not correct, got %@", py); 62 | } 63 | 64 | @end 65 | -------------------------------------------------------------------------------- /objective-c/Hanzi2Pinyin/Hanzi2PinyinTests/en.lproj/InfoPlist.strings: -------------------------------------------------------------------------------- 1 | /* Localized versions of Info.plist keys */ 2 | 3 | -------------------------------------------------------------------------------- /objective-c/README.md: -------------------------------------------------------------------------------- 1 | I'm not familiar with Objective-C. So this is a very simple Objective-C 2 | implementation which mainly uses the C version. 3 | --------------------------------------------------------------------------------