├── KeyboardForm.cpp ├── KeyboardForm.h ├── Qt5Input.json ├── README.md ├── VirtualKeyboard.pro ├── VirtualKeyboard.pro.user ├── dict ├── dict_pinyin.dat └── pinyinEx.ini ├── font └── FontAwesome.otf ├── images ├── btn1.png ├── btn2.png ├── btn3.png └── change.png ├── inputcontext.cpp ├── inputcontext.h ├── keyeventdispatcher.cpp ├── keyeventdispatcher.h ├── pinyin ├── atomdictbase.h ├── dictbuilder.cpp ├── dictbuilder.h ├── dictdef.h ├── dictlist.cpp ├── dictlist.h ├── dicttrie.cpp ├── dicttrie.h ├── lpicache.cpp ├── lpicache.h ├── matrixsearch.cpp ├── matrixsearch.h ├── mystdlib.cpp ├── mystdlib.h ├── ngram.cpp ├── ngram.h ├── pinyinime.cpp ├── pinyinime.h ├── searchutility.cpp ├── searchutility.h ├── spellingtable.cpp ├── spellingtable.h ├── spellingtrie.cpp ├── spellingtrie.h ├── splparser.cpp ├── splparser.h ├── sync.cpp ├── sync.h ├── userdict.cpp ├── userdict.h ├── utf16char.cpp ├── utf16char.h ├── utf16reader.cpp └── utf16reader.h ├── pinyininputmethod.cpp ├── pinyininputmethod.h ├── platforminputcontextplugin.cpp ├── platforminputcontextplugin.h ├── res.qrc └── virtualkeyboard_global.h /KeyboardForm.cpp: -------------------------------------------------------------------------------- 1 | #include "KeyboardForm.h" 2 | #include 3 | #include 4 | #include 5 | 6 | const char *kbnumber = "1234567890"; 7 | const char *kbLetter = "qwertyuiopasdfghjklzxcvbnm"; 8 | const char *kbSymbol = "!@#$%^&*()-_+=[]{}|\\:;\"<>?"; 9 | const int verticalSpacing = 8; 10 | const int horizontalSpacing = 8; 11 | const int margins = 8; 12 | const int btnHeight = (250 - verticalSpacing*5)/5; 13 | const int btnWidth = (840 - margins*2 - horizontalSpacing*10)/10.8+0.5; 14 | 15 | KeyboardForm::KeyboardForm(QWidget *parent) : 16 | QDialog(parent) 17 | { 18 | setWindowModality(Qt::WindowModal); 19 | //setWindowFlags(windowFlags() | Qt::FramelessWindowHint | Qt::WindowStaysOnTopHint | Qt::WindowDoesNotAcceptFocus); 20 | setWindowFlags(windowFlags() | Qt::Tool | Qt::FramelessWindowHint | 
Qt::WindowStaysOnTopHint | Qt::WindowDoesNotAcceptFocus); 21 | 22 | setWindowOpacity(1); 23 | setAttribute(Qt::WA_TranslucentBackground); 24 | setStyleSheet("QLabel{font: bold normal 18px 'Consolas'} QWidget QPushButton{border-image: url(:/images/btn1.png); border-width: 5; font: bold normal } QPushButton:pressed{border-image: url(:/images/btn3.png); border-width: 5;}"); 25 | 26 | QFontDatabase::addApplicationFont(":/FontAwesome.otf"); 27 | QFont icofont("FontAwesome"); 28 | icofont.setPixelSize(32); 29 | 30 | QFont deffont("Consolas"); 31 | deffont.setPixelSize(24); 32 | 33 | m_inputMode = ImEn; 34 | m_isMousePress = false; 35 | m_isShiftInput = false; 36 | m_mainLayout = new QVBoxLayout; 37 | m_mainLayout->setMargin(margins); 38 | m_mainLayout->setSpacing(horizontalSpacing); 39 | 40 | // 显示拼音 lineEdit 41 | m_pyFrm = new QWidget; 42 | m_txtPinYin = new QLineEdit; 43 | m_labPyText = new QLabel; 44 | m_labPyText->hide(); 45 | m_labPyText->setStyleSheet("background: #a0ffffff; border-color: gray; border-width: 1px; border-style: solid; border-radius: 5px;"); 46 | QHBoxLayout *hl1 = new QHBoxLayout; 47 | QVBoxLayout *vl = new QVBoxLayout; 48 | QVBoxLayout *mvl = new QVBoxLayout; 49 | mvl->setMargin(0); 50 | mvl->setSpacing(0); 51 | hl1->setMargin(0); 52 | hl1->setSpacing(horizontalSpacing); 53 | hl1->addWidget(m_labPyText); 54 | hl1->addWidget(m_txtPinYin); 55 | hl1->addStretch(); 56 | m_txtPinYin->hide(); 57 | connect(m_txtPinYin, SIGNAL(textChanged(QString)), SLOT(setText(QString))); 58 | 59 | // 汉字 60 | QHBoxLayout *hl2 = new QHBoxLayout; 61 | hl2->setSpacing(0); 62 | for (int i = 0; i < 10; i++) 63 | { 64 | QPushButton *btn = new QPushButton; 65 | btn->setFont(deffont); 66 | btn->setStyleSheet("QPushButton{border: none; border-image: none; font-size: 18px} QPushButton:focus{color: #ff8000; padding: -20;}"); 67 | hl2->addWidget(btn); 68 | m_listHanzi.append(btn); 69 | if (i != 0 && i != 9) 70 | connect(btn, SIGNAL(clicked()), SLOT(slotHaiziBtnClicked())); 71 | } 72 
| m_listHanzi[0]->setText("\xC2\xAB");// "\xC2\xBB" : "\xC2\xAB" 73 | m_listHanzi[0]->setEnabled(false); 74 | m_listHanzi[0]->setSizePolicy(QSizePolicy::Fixed, QSizePolicy::Fixed); 75 | m_listHanzi[9]->setText("\xC2\xBB"); 76 | m_listHanzi[9]->setEnabled(false); 77 | m_listHanzi[9]->setSizePolicy(QSizePolicy::Fixed, QSizePolicy::Fixed); 78 | connect(m_listHanzi[0], SIGNAL(clicked()), SLOT(prevPage())); 79 | connect(m_listHanzi[9], SIGNAL(clicked()), SLOT(nextPage())); 80 | vl->addStretch(); 81 | vl->addLayout(hl1); 82 | m_pyFrm->setLayout(hl2); 83 | //pyFrm->hide(); 84 | m_pyFrm->setStyleSheet("background:#FFFFFF"); 85 | vl->addWidget(m_pyFrm); 86 | mvl->addLayout(vl); 87 | 88 | // ==========================================数字========================================== 89 | QHBoxLayout *hl123 = new QHBoxLayout; 90 | hl123->setSpacing(horizontalSpacing); 91 | 92 | QPushButton *btnFan = new QPushButton("~"); 93 | btnFan->setFont(deffont); 94 | btnFan->setFixedSize(btnWidth*0.8, btnHeight); 95 | //btnFan->setStyleSheet("QPushButton{color:#FFFFFF; border-image: url(:/images/btn2.png); border-width: 5;} QPushButton:pressed{border-image: url(:/images/btn3.png); border-width: 5;}"); 96 | connect(btnFan, SIGNAL(clicked()), SLOT(btnClicked())); 97 | hl123->addWidget(btnFan); 98 | hl123->setSpacing(horizontalSpacing); 99 | 100 | int idx = 0; 101 | for (int j = 0; j < 10; j++) 102 | { 103 | QPushButton *btn = new QPushButton(QString(QLatin1Char(kbnumber[idx++]))); 104 | btn->setFixedSize(btnWidth, btnHeight); 105 | btn->setFont(deffont); 106 | connect(btn, SIGNAL(clicked()), SLOT(btnClicked())); 107 | hl123->addWidget(btn); 108 | } 109 | m_mainLayout->addLayout(hl123); 110 | 111 | // ==========================================字母========================================== 112 | QList list; 113 | //=================== row 1 ======================= 114 | QHBoxLayout *hl3 = new QHBoxLayout; 115 | for (int j = 0, idx = 0; j < 10; j++) 116 | { 117 | QPushButton *btn = new 
QPushButton(QString(QLatin1Char(kbLetter[idx++]))); 118 | btn->setFixedSize(btnWidth, btnHeight); 119 | btn->setFont(deffont); 120 | connect(btn, SIGNAL(clicked()), SLOT(slotBtnClicked())); 121 | hl3->addWidget(btn); 122 | list.append(btn); 123 | } 124 | 125 | QPushButton *btnBack = new QPushButton("\uf060"); 126 | btnBack->setFont(icofont); 127 | btnBack->setFixedSize(btnWidth*0.8, btnHeight); 128 | btnBack->setAutoRepeat(true); 129 | btnBack->setStyleSheet("QPushButton{color:#FFFFFF; border-image: url(:/images/btn2.png); border-width: 5;} QPushButton:pressed{border-image: url(:/images/btn3.png); border-width: 5;}"); 130 | connect(btnBack, SIGNAL(clicked()), SLOT(backSpace())); 131 | hl3->addWidget(btnBack); 132 | 133 | m_mainLayout->addLayout(hl3); 134 | m_listBtns.append(list); 135 | m_listCharsBtns.append(list); 136 | 137 | //=================== row 2 ======================= 138 | list.clear(); 139 | QHBoxLayout *hl6 = new QHBoxLayout; 140 | hl6->setSpacing(horizontalSpacing*1.2); 141 | hl6->addStretch(); 142 | for (int j = 0; j < 9; j++) 143 | { 144 | QPushButton *btn = new QPushButton(QString(QLatin1Char(kbLetter[idx++]))); 145 | btn->setFont(deffont); 146 | btn->setFixedSize(btnWidth, btnHeight); 147 | connect(btn, SIGNAL(clicked()), SLOT(slotBtnClicked())); 148 | hl6->addWidget(btn); 149 | list.append(btn); 150 | } 151 | QPushButton *btnEnter = new QPushButton("ENTER"); 152 | btnEnter->setFont(deffont); 153 | btnEnter->setFixedSize(btnWidth*1.5, btnHeight); 154 | btnEnter->setStyleSheet("QPushButton{color:#FFFFFF; border-image: url(:/images/btn2.png); border-width: 5;} QPushButton:pressed{border-image: url(:/images/btn3.png); border-width: 5;}"); 155 | connect(btnEnter, SIGNAL(clicked()), SLOT(enter())); 156 | hl6->addWidget(btnEnter); 157 | hl6->addStretch(); 158 | m_mainLayout->addLayout(hl6); 159 | m_listBtns.append(list); 160 | m_listCharsBtns.append(list); 161 | 162 | list.clear(); 163 | QHBoxLayout *hl5 = new QHBoxLayout; 164 | 
hl5->setSpacing(horizontalSpacing); 165 | //=================== row 3 ======================= 166 | m_btnShift = new QPushButton("\uf062"); 167 | m_btnShift->setFont(icofont); 168 | m_btnShift->setFixedSize(btnWidth*0.8, btnHeight); 169 | m_btnShift->setStyleSheet("QPushButton{color:#FFFFFF; border-image: url(:/images/btn2.png); border-width: 5;} QPushButton:pressed{border-image: url(:/images/btn3.png); border-width: 5;}"); 170 | connect(m_btnShift, SIGNAL(clicked()), SLOT(shiftClicked())); 171 | hl5->addWidget(m_btnShift); 172 | for (int j = 0; j < 7; j++) 173 | { 174 | QPushButton *btn = new QPushButton(QString(QLatin1Char(kbLetter[idx++]))); 175 | btn->setFont(deffont); 176 | btn->setFixedSize(btnWidth, btnHeight); 177 | connect(btn, SIGNAL(clicked()), SLOT(slotBtnClicked())); 178 | hl5->addWidget(btn); 179 | list.append(btn); 180 | } 181 | QPushButton* btn = new QPushButton(","); 182 | btn->setFont(deffont); 183 | btn->setFixedSize(btnWidth, btnHeight); 184 | connect(btn, SIGNAL(clicked()), SLOT(btnClicked())); 185 | hl5->addWidget(btn); 186 | 187 | btn = new QPushButton("."); 188 | btn->setFont(deffont); 189 | btn->setFixedSize(btnWidth, btnHeight); 190 | connect(btn, SIGNAL(clicked()), SLOT(btnClicked())); 191 | hl5->addWidget(btn); 192 | 193 | btn = new QPushButton("/"); 194 | btn->setFont(deffont); 195 | btn->setFixedSize(btnWidth, btnHeight); 196 | connect(btn, SIGNAL(clicked()), SLOT(btnClicked())); 197 | hl5->addWidget(btn); 198 | 199 | m_mainLayout->addLayout(hl5); 200 | m_listBtns.append(list); 201 | m_listCharsBtns.append(list); 202 | 203 | // ==========================================last========================================== 204 | QHBoxLayout *hl4 = new QHBoxLayout; 205 | hl4->setSpacing(horizontalSpacing); 206 | QList list2; 207 | 208 | m_btnSymbol = new QPushButton(".?123"); 209 | m_btnSymbol->setFont(deffont); 210 | m_btnSymbol->setFixedSize(btnWidth*1.5, btnHeight); 211 | m_btnSymbol->setStyleSheet("QPushButton{color:#FFFFFF; border-image: 
url(:/images/btn2.png); border-width: 5;} QPushButton:pressed{border-image: url(:/images/btn3.png); border-width: 5;}"); 212 | connect(m_btnSymbol, SIGNAL(clicked()), SLOT(changeSymbol())); 213 | list2.append(m_btnSymbol); 214 | hl4->addWidget(m_btnSymbol); 215 | 216 | m_btnChange = new QPushButton("\uf0ac"); 217 | //btnChange->setIconSize(QSize(28, 28 )); 218 | m_btnChange->setFont(icofont); 219 | m_btnChange->setProperty("Mode", ImEn); 220 | m_btnChange->setFixedSize(btnWidth, btnHeight); 221 | m_btnChange->setStyleSheet("QPushButton{color:#FFFFFF; border-image: url(:/images/btn2.png); border-width: 5;} QPushButton:pressed{border-image: url(:/images/btn3.png); border-width: 5;}"); 222 | connect(m_btnChange, SIGNAL(clicked()), SLOT(changeInputMode())); 223 | list2.append(m_btnChange); 224 | hl4->addWidget(m_btnChange); 225 | 226 | m_btnSpace = new QPushButton("英文"); // 空格 227 | m_btnSpace->setFont(deffont); 228 | m_btnSpace->setFixedHeight(btnHeight); 229 | connect(m_btnSpace, SIGNAL(clicked()), SLOT(space())); 230 | list2.append(m_btnSpace); 231 | hl4->addWidget(m_btnSpace); 232 | 233 | QPushButton *btnSplit = new QPushButton("'"); 234 | btnSplit->setFont(deffont); 235 | btnSplit->setFixedSize(btnWidth, btnHeight); 236 | connect(btnSplit, SIGNAL(clicked()), SLOT(slotBtnClicked())); 237 | list2.append(btnSplit); 238 | hl4->addWidget(btnSplit); 239 | 240 | QPushButton *btnHide = new QPushButton("\uf11c\uf103"); 241 | btnHide->setFont(icofont); 242 | btnHide->setFixedSize(btnWidth*1.5, btnHeight); 243 | btnHide->setStyleSheet("QPushButton{color:#FFFFFF; border-image: url(:/images/btn2.png); border-width: 5;} QPushButton:pressed{border-image: url(:/images/btn3.png); border-width: 5;}"); 244 | connect(btnHide, SIGNAL(clicked()), SLOT(hide())); 245 | list2.append(btnHide); 246 | hl4->addWidget(btnHide); 247 | 248 | m_listBtns.append(list2); 249 | m_mainLayout->addLayout(hl4); 250 | m_btnFrm = new QWidget(); 251 | m_btnFrm->setLayout(m_mainLayout); 252 | 
m_btnFrm->setStyleSheet("background:#D1D4DA;"); 253 | mvl->addWidget(m_btnFrm); 254 | setLayout(mvl); 255 | connect(&m_pinyinInput, SIGNAL(pinyinTextChanged(QString)), SLOT(setText(QString))); 256 | m_hanziPageCnt = 0; 257 | m_pyFrm->setFixedHeight(40); 258 | setFixedHeight(m_pyFrm->height() + m_labPyText->height() + m_btnFrm->height()); 259 | } 260 | 261 | KeyboardForm::~KeyboardForm() 262 | { 263 | delete m_mainLayout; 264 | } 265 | 266 | void KeyboardForm::slotBtnClicked() 267 | { 268 | QString text = ((QPushButton*)sender())->text(); 269 | if (m_inputMode == ImNum || m_isShiftInput) 270 | sendKeyToFocusItem(((QPushButton*)sender())->text()); 271 | else 272 | { 273 | QString str = m_txtPinYin->text() + text; 274 | m_txtPinYin->setText(str); 275 | m_pinyinInput.Matching(str, m_inputMode == ImEn); 276 | if (m_pinyinInput.HanziModel.size()) 277 | { 278 | m_hanziPageCnt = m_pinyinInput.HanziModel.size() / 8 + int(m_pinyinInput.HanziModel.size() % 8 > 0); 279 | m_curHanziPage = 1; 280 | } 281 | else 282 | m_hanziPageCnt = 0; 283 | displayHanzi(); 284 | } 285 | } 286 | 287 | void KeyboardForm::btnClicked() 288 | { 289 | sendKeyToFocusItem(((QPushButton*)sender())->text()); 290 | } 291 | 292 | void KeyboardForm::slotHaiziBtnClicked() 293 | { 294 | sendKeyToFocusItem(((QPushButton*)sender())->text()); 295 | m_txtPinYin->clear(); 296 | m_labPyText->clear(); 297 | m_labPyText->hide(); 298 | m_hanziPageCnt = 0; 299 | displayHanzi(); 300 | } 301 | 302 | void KeyboardForm::setText(QString str) 303 | { 304 | m_labPyText->setVisible(str.size()); 305 | m_labPyText->setText(str); 306 | } 307 | 308 | void KeyboardForm::shiftClicked() 309 | { 310 | m_isShiftInput = !m_isShiftInput; 311 | m_inputMode = (InputMode)m_btnChange->property("Mode").toInt(); 312 | m_btnSpace->setText(m_inputMode == ImCn ? 
"拼音" : "英文"); 313 | m_btnSymbol->setText(".?123"); 314 | m_btnShift->setStyleSheet(tr("QPushButton{color:#FFFFFF; border-image: url(:/images/btn%1.png); border-width: 5;} QPushButton:pressed{border-image: url(:/images/btn3.png); border-width: 5;}").arg(m_isShiftInput?3:2)); 315 | m_hanziPageCnt = 0; 316 | m_txtPinYin->clear(); 317 | displayHanzi(); 318 | 319 | for (int i = 0, idx = 0; i < m_listCharsBtns.size(); i++) 320 | foreach (QPushButton* btn, m_listCharsBtns[i]) 321 | btn->setText(QString(QLatin1Char(m_isShiftInput ? kbLetter[idx] - 32 : kbLetter[idx]))), idx++; 322 | } 323 | 324 | void KeyboardForm::changeInputMode() 325 | { 326 | m_btnChange->setProperty("Mode", m_inputMode = m_btnChange->property("Mode").toInt() == ImEn ? ImCn : ImEn); 327 | //m_pyFrm->setVisible(true); 328 | m_btnSymbol->setText(".?123"); 329 | m_hanziPageCnt = 0; 330 | m_txtPinYin->clear(); 331 | displayHanzi(); 332 | m_btnSpace->setText(m_inputMode == ImCn ? "拼音" : "英文"); 333 | m_isShiftInput = false; 334 | m_btnShift->setStyleSheet("QPushButton{color:#FFFFFF; border-image: url(:/images/btn2.png); border-width: 5;} QPushButton:pressed{border-image: url(:/images/btn3.png); border-width: 5;}"); 335 | for (int i = 0, idx = 0; i < m_listCharsBtns.size(); i++) 336 | foreach (QPushButton* btn, m_listCharsBtns[i]) 337 | btn->setText(QString(QLatin1Char(kbLetter[idx]))), idx++; 338 | } 339 | 340 | void KeyboardForm::changeSymbol() 341 | { 342 | if (m_btnSymbol->text() == ".?123") { 343 | m_btnSymbol->setText("返回"); 344 | m_hanziPageCnt = 0; 345 | m_txtPinYin->clear(); 346 | //m_pyFrm->hide(); 347 | m_isShiftInput = false; 348 | m_btnShift->setStyleSheet("QPushButton{color:#FFFFFF; border-image: url(:/images/btn2.png); border-width: 5;} QPushButton:pressed{border-image: url(:/images/btn3.png); border-width: 5;}"); 349 | displayHanzi(); 350 | if (m_inputMode == ImCn) 351 | m_listBtns.removeAt(0); 352 | m_inputMode = ImNum; 353 | m_btnSpace->setText("数字"); 354 | for (int i = 0, idx = -1; i < 
m_listCharsBtns.size(); i++) 355 | foreach (QPushButton* btn, m_listCharsBtns[i]) 356 | btn->setText(kbSymbol[++idx] == '&' ? "&&" : QString(QLatin1Char(kbSymbol[idx]))); 357 | } 358 | else { 359 | m_btnSymbol->setText(".?123"); 360 | m_inputMode = (InputMode)m_btnChange->property("Mode").toInt(); 361 | m_btnSpace->setText(m_inputMode == ImCn ? "拼音" : "英文"); 362 | for (int i = 0, idx = 0; i < m_listCharsBtns.size(); i++) 363 | foreach (QPushButton* btn, m_listCharsBtns[i]) 364 | btn->setText(QString(QLatin1Char(kbLetter[idx++]))); 365 | } 366 | } 367 | 368 | void KeyboardForm::space() 369 | { 370 | sendKeyToFocusItem(" "); 371 | } 372 | 373 | void KeyboardForm::enter() /* Commits the pending pinyin text verbatim if there is any; otherwise sends a newline to the focus item. */ 374 | { 375 | if (m_txtPinYin->text().size()) 376 | { 377 | sendKeyToFocusItem(m_txtPinYin->text()); 378 | m_txtPinYin->clear(); 379 | m_labPyText->clear(); 380 | m_labPyText->hide(); /* the composition is finished: also drop the stale candidate list, as slotHaiziBtnClicked() does, so old candidates can no longer be tapped */ m_hanziPageCnt = 0; displayHanzi(); 381 | } 382 | else 383 | sendKeyToFocusItem("\n"); 384 | } 385 | 386 | void KeyboardForm::backSpace() 387 | { 388 | if (m_inputMode == ImNum || m_isShiftInput) 389 | sendKeyToFocusItem("\x7F"); 390 | else 391 | { 392 | QString text = m_txtPinYin->text(); 393 | if (text.size()) 394 | { 395 | text.remove(text.size()-1, 1); 396 | m_txtPinYin->setText(text); 397 | m_pinyinInput.Matching(text, m_inputMode == ImEn); 398 | if (m_pinyinInput.HanziModel.size()) 399 | { 400 | m_hanziPageCnt = m_pinyinInput.HanziModel.size() / 8 + int(m_pinyinInput.HanziModel.size() % 8 > 0); 401 | m_curHanziPage = 1; 402 | } 403 | else 404 | m_hanziPageCnt = 0; 405 | displayHanzi(); 406 | } 407 | else 408 | { 409 | m_txtPinYin->clear(); 410 | m_labPyText->clear(); 411 | m_labPyText->hide(); 412 | sendKeyToFocusItem("\x7F"); 413 | } 414 | } 415 | } 416 | 417 | void KeyboardForm::mousePressEvent(QMouseEvent *e) 418 | { 419 | if(e->button() == Qt::LeftButton) 420 | { 421 | m_isMousePress = true; 422 | m_movePoint = e->pos(); 423 | } 424 | } 425 | 426 | void KeyboardForm::mouseMoveEvent(QMouseEvent *e) 427 | { 428 | if(m_isMousePress) 429 | { 430 | 
QPoint move_pos = e->globalPos(); 431 | this->move(move_pos - m_movePoint); 432 | } 433 | } 434 | 435 | void KeyboardForm::mouseReleaseEvent(QMouseEvent *e) 436 | { 437 | m_isMousePress = false; 438 | } 439 | 440 | void KeyboardForm::displayHanzi() /* Refreshes the candidate bar: buttons 1-8 show the current page of m_pinyinInput.HanziModel (8 per page), buttons 0 and 9 are the previous/next page controls. */ 441 | { 442 | if (m_hanziPageCnt <= 0) 443 | { 444 | m_listHanzi[0]->setEnabled(false); 445 | m_listHanzi[9]->setEnabled(false); 446 | for (int i = 0; i < 8; i++) 447 | { 448 | m_listHanzi[i+1]->setText(""); 449 | m_listHanzi[i+1]->setEnabled(false); 450 | } 451 | } 452 | else if (m_hanziPageCnt == 1) 453 | { 454 | m_listHanzi[0]->setEnabled(false); 455 | m_listHanzi[9]->setEnabled(false); 456 | int len = m_pinyinInput.HanziModel.size(); 457 | for (int i = 0; i < len; i++) 458 | { 459 | m_listHanzi[i+1]->setText(m_pinyinInput.HanziModel[i]); 460 | m_listHanzi[i+1]->setEnabled(true); 461 | } 462 | for (int i = len; i < 8; i++) 463 | { 464 | m_listHanzi[i+1]->setText(""); 465 | m_listHanzi[i+1]->setEnabled(false); 466 | } 467 | } 468 | else if (m_curHanziPage == 1) 469 | { 470 | m_listHanzi[0]->setEnabled(false); 471 | m_listHanzi[9]->setEnabled(true); 472 | for (int i = 0; i < 8; i++) 473 | { 474 | m_listHanzi[i+1]->hide(); 475 | m_listHanzi[i+1]->setText(m_pinyinInput.HanziModel[i]); 476 | m_listHanzi[i+1]->setEnabled(true); 477 | m_listHanzi[i+1]->show(); 478 | } 479 | } 480 | else if (m_curHanziPage == m_hanziPageCnt) 481 | { 482 | m_listHanzi[0]->setEnabled(true); 483 | m_listHanzi[9]->setEnabled(false); 484 | int idx = (m_hanziPageCnt - 1) * 8; 485 | int len = m_pinyinInput.HanziModel.size() - idx; /* candidates left for the last page (1..8); "size() % 8" gave 0 and a blank page whenever the count was an exact multiple of 8 */ 486 | for (int i = 0; i < len; i++) 487 | { 488 | m_listHanzi[i+1]->setText(m_pinyinInput.HanziModel[idx + i]); 489 | m_listHanzi[i+1]->setEnabled(true); 490 | } 491 | for (int i = len; i < 8; i++) 492 | { 493 | m_listHanzi[i+1]->setText(""); 494 | m_listHanzi[i+1]->setEnabled(false); 495 | } 496 | } 497 | else 498 | { 499 | m_listHanzi[0]->setEnabled(true); 500 | m_listHanzi[9]->setEnabled(true); 501 | int idx = 
(m_curHanziPage - 1) * 8; 502 | for (int i = 0; i < 8; i++) 503 | { 504 | m_listHanzi[i+1]->setText(m_pinyinInput.HanziModel[idx + i]); 505 | m_listHanzi[i+1]->setEnabled(true); 506 | } 507 | } 508 | } 509 | 510 | void KeyboardForm::prevPage() 511 | { 512 | m_curHanziPage--; 513 | displayHanzi(); 514 | } 515 | 516 | void KeyboardForm::nextPage() 517 | { 518 | m_curHanziPage++; 519 | displayHanzi(); 520 | } 521 | -------------------------------------------------------------------------------- /KeyboardForm.h: -------------------------------------------------------------------------------- 1 | #ifndef KEYBOARDFROM_H 2 | #define KEYBOARDFROM_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "pinyininputmethod.h" 11 | 12 | class KeyboardForm : public QDialog 13 | { 14 | Q_OBJECT 15 | public: 16 | explicit KeyboardForm(QWidget *parent = 0); 17 | ~KeyboardForm(); 18 | 19 | signals: 20 | void sendKeyToFocusItem(const QString &keyText); 21 | 22 | public slots: 23 | void slotBtnClicked(); 24 | void btnClicked(); 25 | void slotHaiziBtnClicked(); 26 | void setText(QString str); 27 | void shiftClicked(); 28 | void changeInputMode(); 29 | void changeSymbol(); 30 | void space(); 31 | void enter(); 32 | void backSpace(); 33 | void prevPage(); 34 | void nextPage(); 35 | 36 | private: 37 | void mousePressEvent(QMouseEvent*e); 38 | void mouseMoveEvent(QMouseEvent*e); 39 | void mouseReleaseEvent(QMouseEvent*e); 40 | void displayHanzi(); 41 | 42 | private: 43 | QWidget* m_pyFrm, *m_btnFrm; // 拼音面板 44 | QLineEdit *m_txtPinYin; // 真实拼音 45 | QLabel *m_labPyText; // 分割拼音 46 | QList m_listHanzi; 47 | QList >m_listCharsBtns; // A-Z Symbol 48 | QList >m_listBtns; 49 | QVBoxLayout *m_mainLayout; 50 | QPushButton *m_btnChange, *m_btnShift, *m_btnSymbol, *m_btnSpace; // 显示切换输入法 51 | CPinyinInputMethod m_pinyinInput; 52 | 53 | enum InputMode{ImEn, ImCn, ImNum}; 54 | InputMode m_inputMode; 55 | bool m_isShiftInput; 56 | int m_hanziPageCnt, 
m_curHanziPage; 57 | bool m_isMousePress; 58 | QPoint m_movePoint; 59 | }; 60 | #endif // KEYBOARDFROM_H 61 | -------------------------------------------------------------------------------- /Qt5Input.json: -------------------------------------------------------------------------------- 1 | { 2 | "Keys": [ "Qt5Input" ] 3 | } 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # VirtualKeyboard Qt5虚拟键盘 2 | 支持Qt版本:Qt5.0.x ~ Qt5.4.x 3 | 使用说明: 4 | 1、先编译 VirtualKeyboard 源代码,生成 VirtualKeyboard.so(.dll); 5 | 2、假设要使用输入法的程序是Qt5Demo.exe; 6 | 3、在Qt5Demo.exe所在目录新建一个文件夹 platforminputcontexts (文件夹名称不可更改成其他), 7 | 将 VirtualKeyboard.so(.dll) 放入其中; 8 | 4、将代码文件夹中 dict 整个文件夹复制到Qt5Demo.exe所在目录; 9 | 5、最后运行 Qt5Demo.exe 即可。 10 | 11 | 20161209 Qt5.0 ~ Qt5.5 可以用。邓艾更新 -------------------------------------------------------------------------------- /VirtualKeyboard.pro: -------------------------------------------------------------------------------- 1 | #------------------------------------------------- 2 | # 3 | # Project created by QtCreator 2013-04-04T23:11:38 4 | # 5 | #------------------------------------------------- 6 | 7 | QT += gui-private widgets xml sql multimedia 8 | 9 | CONFIG += plugin 10 | 11 | TARGET = VirtualKeyboard 12 | TEMPLATE = lib 13 | 14 | INCLUDEPATH += $$PWD/pinyin 15 | 16 | 17 | DEFINES += VIRTUALKEYBOARD_LIBRARY 18 | 19 | win32 { 20 | DESTDIR = C:/CL/Qt.reference.2016.VirtualKeyboard/build/platforminputcontexts 21 | OBJECTS_DIR = $$DESTDIR/obj 22 | MOC_DIR = $$DESTDIR/moc 23 | } 24 | 25 | SOURCES += \ 26 | KeyboardForm.cpp \ 27 | pinyininputmethod.cpp \ 28 | pinyin/dictbuilder.cpp \ 29 | pinyin/dictlist.cpp \ 30 | pinyin/dicttrie.cpp \ 31 | pinyin/lpicache.cpp \ 32 | pinyin/matrixsearch.cpp \ 33 | pinyin/mystdlib.cpp \ 34 | pinyin/ngram.cpp \ 35 | pinyin/pinyinime.cpp \ 36 | pinyin/searchutility.cpp \ 37 | pinyin/spellingtable.cpp \ 38 | 
pinyin/spellingtrie.cpp \ 39 | pinyin/splparser.cpp \ 40 | pinyin/sync.cpp \ 41 | pinyin/userdict.cpp \ 42 | pinyin/utf16char.cpp \ 43 | pinyin/utf16reader.cpp \ 44 | inputcontext.cpp \ 45 | keyeventdispatcher.cpp \ 46 | platforminputcontextplugin.cpp 47 | 48 | HEADERS +=\ 49 | KeyboardForm.h \ 50 | pinyininputmethod.h \ 51 | pinyin/atomdictbase.h \ 52 | pinyin/dictbuilder.h \ 53 | pinyin/dictdef.h \ 54 | pinyin/dictlist.h \ 55 | pinyin/dicttrie.h \ 56 | pinyin/lpicache.h \ 57 | pinyin/matrixsearch.h \ 58 | pinyin/mystdlib.h \ 59 | pinyin/ngram.h \ 60 | pinyin/pinyinime.h \ 61 | pinyin/searchutility.h \ 62 | pinyin/spellingtable.h \ 63 | pinyin/spellingtrie.h \ 64 | pinyin/splparser.h \ 65 | pinyin/sync.h \ 66 | pinyin/userdict.h \ 67 | pinyin/utf16char.h \ 68 | pinyin/utf16reader.h \ 69 | inputcontext.h \ 70 | platforminputcontextplugin.h \ 71 | keyeventdispatcher.h \ 72 | virtualkeyboard_global.h 73 | 74 | RESOURCES += \ 75 | res.qrc 76 | 77 | -------------------------------------------------------------------------------- /dict/dict_pinyin.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linbin823/VirtualKeyboard/fd2fa655b4f490808853b45a6b676987366df5a1/dict/dict_pinyin.dat -------------------------------------------------------------------------------- /font/FontAwesome.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linbin823/VirtualKeyboard/fd2fa655b4f490808853b45a6b676987366df5a1/font/FontAwesome.otf -------------------------------------------------------------------------------- /images/btn1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linbin823/VirtualKeyboard/fd2fa655b4f490808853b45a6b676987366df5a1/images/btn1.png -------------------------------------------------------------------------------- /images/btn2.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/linbin823/VirtualKeyboard/fd2fa655b4f490808853b45a6b676987366df5a1/images/btn2.png -------------------------------------------------------------------------------- /images/btn3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linbin823/VirtualKeyboard/fd2fa655b4f490808853b45a6b676987366df5a1/images/btn3.png -------------------------------------------------------------------------------- /images/change.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linbin823/VirtualKeyboard/fd2fa655b4f490808853b45a6b676987366df5a1/images/change.png -------------------------------------------------------------------------------- /inputcontext.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "inputcontext.h" 8 | #include "pinyininputmethod.h" 9 | 10 | #define WINDOW_HEIGHT QApplication::desktop()->height() 11 | static InputContext *instance = NULL; 12 | 13 | InputContextPrivate::InputContextPrivate() : KeyBoard(0) 14 | { 15 | } 16 | 17 | InputContextPrivate::~InputContextPrivate() 18 | { 19 | if (KeyBoard) 20 | delete KeyBoard; 21 | } 22 | 23 | InputContext* InputContext::Instance() 24 | { 25 | return instance; 26 | } 27 | 28 | InputContext::InputContext() : 29 | QPlatformInputContext() 30 | { 31 | d = new InputContextPrivate(); 32 | instance = this; 33 | } 34 | 35 | InputContext::~InputContext() /* Frees the private data allocated by the constructor (previously leaked) and clears the global instance pointer so Instance() cannot hand out a destroyed object. */ 36 | { delete d; d = NULL; if (instance == this) instance = NULL; 37 | } 38 | 39 | bool InputContext::isValid() const 40 | { 41 | return true; 42 | } 43 | 44 | QRectF InputContext::keyboardRect() const 45 | { 46 | return (d->KeyBoard) ? 
d->KeyBoard->rect() : QRectF(); 47 | } 48 | 49 | void InputContext::showInputPanel() 50 | { 51 | if (!d->KeyBoard) { 52 | d->KeyBoard = new KeyboardForm(); 53 | connect(d->KeyBoard, SIGNAL(sendKeyToFocusItem(QString)), &d->keyEventDispatcher, SLOT(sendKeyToFocusItem(QString))); 54 | } 55 | d->KeyBoard->show(); 56 | d->KeyBoard->move(d->KeyBoard->x(), WINDOW_HEIGHT - d->KeyBoard->height()); 57 | 58 | QPlatformInputContext::showInputPanel(); 59 | } 60 | 61 | void InputContext::hideInputPanel() 62 | { 63 | if (d->KeyBoard) 64 | d->KeyBoard->hide(); 65 | QPlatformInputContext::hideInputPanel(); 66 | } 67 | 68 | bool InputContext::isInputPanelVisible() const 69 | { 70 | return (d->KeyBoard) ? d->KeyBoard->isVisible() : false; 71 | } 72 | 73 | 74 | void InputContext::setFocusObject(QObject *object) 75 | { 76 | d->keyEventDispatcher.setFocusItem(object); 77 | } 78 | -------------------------------------------------------------------------------- /inputcontext.h: -------------------------------------------------------------------------------- 1 | #ifndef MOCKUPINPUTCONTEXT_H 2 | #define MOCKUPINPUTCONTEXT_H 3 | #include 4 | #include 5 | #include "keyeventdispatcher.h" 6 | #include "KeyboardForm.h" 7 | 8 | class InputContextPrivate; 9 | class InputContext : public QPlatformInputContext 10 | { 11 | Q_OBJECT 12 | public: 13 | InputContext(); 14 | ~InputContext(); 15 | //retur true if plugin is enabled 16 | bool isValid() const; 17 | 18 | //this value will be available in QGuiApplication::inputMethod()->keyboardRectangle() 19 | QRectF keyboardRect() const; 20 | 21 | //this value will be available in QGuiApplication::inputMethod()->isVisible() 22 | bool isInputPanelVisible() const; 23 | 24 | //editor pointer 25 | void setFocusObject(QObject *object); 26 | static InputContext* Instance(); 27 | public slots: 28 | void hideInputPanel(); 29 | //show and hide invoked by Qt when editor gets focus 30 | void showInputPanel(); 31 | public: 32 | InputContextPrivate *d; 33 | }; 34 | 35 | 
class InputContextPrivate 36 | { 37 | public: 38 | InputContextPrivate(); 39 | ~InputContextPrivate(); 40 | KeyboardForm *KeyBoard; 41 | KeyEventDispatcher keyEventDispatcher; 42 | }; 43 | #endif // MOCKUPINPUTCONTEXT_H 44 | -------------------------------------------------------------------------------- /keyeventdispatcher.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "keyeventdispatcher.h" 5 | 6 | KeyEventDispatcher::KeyEventDispatcher(QObject *parent) : 7 | QObject(parent),m_focusItem(0) 8 | { 9 | } 10 | 11 | void KeyEventDispatcher::setFocusItem(QObject *focusItem) 12 | { 13 | m_focusItem = focusItem; 14 | } 15 | 16 | void KeyEventDispatcher::sendKeyToFocusItem(const QString &keyText) 17 | { 18 | if (!m_focusItem) { 19 | return; 20 | } 21 | if (keyText == QString("\x7F")) 22 | { 23 | QCoreApplication::sendEvent(m_focusItem, new QKeyEvent(QEvent::KeyPress, Qt::Key_Backspace, Qt::NoModifier)); 24 | QCoreApplication::sendEvent(m_focusItem, new QKeyEvent(QEvent::KeyRelease, Qt::Key_Backspace, Qt::NoModifier)); 25 | } 26 | else if (keyText == QString("\n")) 27 | { 28 | QCoreApplication::sendEvent(m_focusItem, new QKeyEvent(QEvent::KeyPress, Qt::Key_Return, Qt::NoModifier)); 29 | QCoreApplication::sendEvent(m_focusItem, new QKeyEvent(QEvent::KeyRelease, Qt::Key_Return, Qt::NoModifier)); 30 | } 31 | else if (keyText == QString("&&")) 32 | { 33 | QCoreApplication::sendEvent(m_focusItem, new QKeyEvent(QEvent::KeyPress, 0, Qt::NoModifier, "&")); 34 | QCoreApplication::sendEvent(m_focusItem, new QKeyEvent(QEvent::KeyRelease, 0, Qt::NoModifier, "&")); 35 | } 36 | else if (keyText == QString("&CtrlC&")) 37 | { 38 | QCoreApplication::sendEvent(m_focusItem, new QKeyEvent(QEvent::KeyPress, Qt::Key_C, Qt::ControlModifier)); 39 | QCoreApplication::sendEvent(m_focusItem, new QKeyEvent(QEvent::KeyRelease, Qt::Key_C, Qt::ControlModifier)); 40 | } 41 | else if (keyText == QString("&CtrlV&")) 
42 | { 43 | QCoreApplication::sendEvent(m_focusItem, new QKeyEvent(QEvent::KeyPress, Qt::Key_V, Qt::ControlModifier)); 44 | QCoreApplication::sendEvent(m_focusItem, new QKeyEvent(QEvent::KeyRelease, Qt::Key_V, Qt::ControlModifier)); 45 | } 46 | else if (keyText == QString("&Left&")) 47 | { 48 | QCoreApplication::sendEvent(m_focusItem, new QKeyEvent(QEvent::KeyPress, Qt::Key_Left, Qt::NoModifier)); 49 | QCoreApplication::sendEvent(m_focusItem, new QKeyEvent(QEvent::KeyRelease, Qt::Key_Left, Qt::NoModifier)); 50 | } 51 | else if (keyText == QString("&CtrlA&")) 52 | { 53 | QCoreApplication::sendEvent(m_focusItem, new QKeyEvent(QEvent::KeyPress, Qt::Key_A, Qt::ControlModifier)); 54 | QCoreApplication::sendEvent(m_focusItem, new QKeyEvent(QEvent::KeyRelease, Qt::Key_A, Qt::ControlModifier)); 55 | } 56 | else 57 | { 58 | QCoreApplication::sendEvent(m_focusItem, new QKeyEvent(QEvent::KeyPress, 0, Qt::NoModifier, keyText)); 59 | QCoreApplication::sendEvent(m_focusItem, new QKeyEvent(QEvent::KeyRelease, 0, Qt::NoModifier, keyText)); 60 | } 61 | } -------------------------------------------------------------------------------- /keyeventdispatcher.h: -------------------------------------------------------------------------------- 1 | #ifndef MOCKUPKEYEVENTDISPATCHER_H 2 | #define MOCKUPKEYEVENTDISPATCHER_H 3 | 4 | #include 5 | 6 | class KeyEventDispatcher : public QObject 7 | { 8 | Q_OBJECT 9 | public: 10 | explicit KeyEventDispatcher(QObject *parent = 0); 11 | signals: 12 | 13 | public slots: 14 | QObject * getFocusItem(){return m_focusItem;} 15 | void setFocusItem(QObject *focusItem); 16 | void sendKeyToFocusItem(const QString &keyText); 17 | private: 18 | QObject * m_focusItem; 19 | }; 20 | 21 | #endif // MOCKUPKEYEVENTDISPATCHER_H 22 | -------------------------------------------------------------------------------- /pinyin/atomdictbase.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009 The Android Open 
Source Project 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /** 18 | * This class defines AtomDictBase class which is the base class for all atom 19 | * dictionaries. Atom dictionaries are managed by the decoder class 20 | * MatrixSearch. 21 | * 22 | * When the user appends a new character to the Pinyin string, all enabled atom 23 | * dictionaries' extend_dict() will be called at least once to get candidates 24 | * ended in this step (the information of starting step is also given in the 25 | * parameter). Usually, when extend_dict() is called, a MileStoneHandle object 26 | * returned by a previous calling for a earlier step is given to speed up the 27 | * look-up process, and a new MileStoneHandle object will be returned if 28 | * the extension is successful. 29 | * 30 | * A returned MileStoneHandle object should keep alive until Function 31 | * reset_milestones() is called and this object is noticed to be reset. 32 | * 33 | * Usually, the atom dictionary can use step information to manage its 34 | * MileStoneHandle objects, or it can make the objects in ascendant order to 35 | * make the reset easier. 36 | * 37 | * When the decoder loads the dictionary, it will give a starting lemma id for 38 | * this atom dictionary to map a inner id to a global id. Global ids should be 39 | * used when an atom dictionary talks to any component outside. 
40 | */ 41 | #ifndef PINYINIME_INCLUDE_ATOMDICTBASE_H__ 42 | #define PINYINIME_INCLUDE_ATOMDICTBASE_H__ 43 | 44 | #include 45 | #include "./dictdef.h" 46 | #include "./searchutility.h" 47 | 48 | namespace ime_pinyin { 49 | class AtomDictBase { 50 | public: 51 | virtual ~AtomDictBase() {} 52 | 53 | /** 54 | * Load an atom dictionary from a file. 55 | * 56 | * @param file_name The file name to load dictionary. 57 | * @param start_id The starting id used for this atom dictionary. 58 | * @param end_id The end id (included) which can be used for this atom 59 | * dictionary. User dictionary will always use the last id space, so it can 60 | * ignore this paramter. All other atom dictionaries should check this 61 | * parameter. 62 | * @return True if succeed. 63 | */ 64 | virtual bool load_dict(const char *file_name, LemmaIdType start_id, 65 | LemmaIdType end_id) = 0; 66 | 67 | /** 68 | * Close this atom dictionary. 69 | * 70 | * @return True if succeed. 71 | */ 72 | virtual bool close_dict() = 0; 73 | 74 | /** 75 | * Get the total number of lemmas in this atom dictionary. 76 | * 77 | * @return The total number of lemmas. 78 | */ 79 | virtual size_t number_of_lemmas() = 0; 80 | 81 | /** 82 | * This function is called by the decoder when user deletes a character from 83 | * the input string, or begins a new input string. 84 | * 85 | * Different atom dictionaries may implement this function in different way. 86 | * an atom dictionary can use one of these two parameters (or both) to reset 87 | * its corresponding MileStoneHandle objects according its detailed 88 | * implementation. 89 | * 90 | * For example, if an atom dictionary uses step information to manage its 91 | * MileStoneHandle objects, parameter from_step can be used to identify which 92 | * objects should be reset; otherwise, if another atom dictionary does not 93 | * use the detailed step information, it only uses ascendant handles 94 | * (according to step. 
For the same step, earlier call, smaller handle), it 95 | * can easily reset those MileStoneHandle which are larger than from_handle. 96 | * 97 | * The decoder always reset the decoding state by step. So when it begins 98 | * resetting, it will call reset_milestones() of its atom dictionaries with 99 | * the step information, and the MileStoneHandle objects returned by the 100 | * earliest calling of extend_dict() for that step. 101 | * 102 | * If an atom dictionary does not implement incremental search, this function 103 | * can be totally ignored. 104 | * 105 | * @param from_step From which step(included) the MileStoneHandle 106 | * objects should be reset. 107 | * @param from_handle The ealiest MileStoneHandle object for step from_step 108 | */ 109 | virtual void reset_milestones(uint16 from_step, 110 | MileStoneHandle from_handle) = 0; 111 | 112 | /** 113 | * Used to extend in this dictionary. The handle returned should keep valid 114 | * until reset_milestones() is called. 115 | * 116 | * @param from_handle Its previous returned extended handle without the new 117 | * spelling id, it can be used to speed up the extending. 118 | * @param dep The paramter used for extending. 119 | * @param lpi_items Used to fill in the lemmas matched. 120 | * @param lpi_max The length of the buffer 121 | * @param lpi_num Used to return the newly added items. 122 | * @return The new mile stone for this extending. 0 if fail. 123 | */ 124 | virtual MileStoneHandle extend_dict(MileStoneHandle from_handle, 125 | const DictExtPara *dep, 126 | LmaPsbItem *lpi_items, 127 | size_t lpi_max, size_t *lpi_num) = 0; 128 | 129 | /** 130 | * Get lemma items with scores according to a spelling id stream. 131 | * This atom dictionary does not need to sort the returned items. 132 | * 133 | * @param splid_str The spelling id stream buffer. 134 | * @param splid_str_len The length of the spelling id stream buffer. 135 | * @param lpi_items Used to return matched lemma items with scores. 
136 | * @param lpi_max The maximum size of the buffer to return result. 137 | * @return The number of matched items which have been filled in to lpi_items. 138 | */ 139 | virtual size_t get_lpis(const uint16 *splid_str, uint16 splid_str_len, 140 | LmaPsbItem *lpi_items, size_t lpi_max) = 0; 141 | 142 | /** 143 | * Get a lemma string (The Chinese string) by the given lemma id. 144 | * 145 | * @param id_lemma The lemma id to get the string. 146 | * @param str_buf The buffer to return the Chinese string. 147 | * @param str_max The maximum size of the buffer. 148 | * @return The length of the string, 0 if fail. 149 | */ 150 | virtual uint16 get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, 151 | uint16 str_max) = 0; 152 | 153 | /** 154 | * Get the full spelling ids for the given lemma id. 155 | * If the given buffer is too short, return 0. 156 | * 157 | * @param splids Used to return the spelling ids. 158 | * @param splids_max The maximum buffer length of splids. 159 | * @param arg_valid Used to indicate if the incoming parameters have been 160 | * initialized are valid. If it is true, the splids and splids_max are valid 161 | * and there may be half ids in splids to be updated to full ids. In this 162 | * case, splids_max is the number of valid ids in splids. 163 | * @return The number of ids in the buffer. 164 | */ 165 | virtual uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids, 166 | uint16 splids_max, bool arg_valid) = 0; 167 | 168 | /** 169 | * Function used for prediction. 170 | * No need to sort the newly added items. 171 | * 172 | * @param last_hzs The last n Chinese chracters(called Hanzi), its length 173 | * should be less than or equal to kMaxPredictSize. 174 | * @param hzs_len specifies the length(<= kMaxPredictSize) of the history. 175 | * @param npre_items Used used to return the result. 
176 | * @param npre_max The length of the buffer to return result 177 | * @param b4_used Number of prediction result (from npre_items[-b4_used]) 178 | * from other atom dictionaries. A atom ditionary can just ignore it. 179 | * @return The number of prediction result from this atom dictionary. 180 | */ 181 | virtual size_t predict(const char16 last_hzs[], uint16 hzs_len, 182 | NPredictItem *npre_items, size_t npre_max, 183 | size_t b4_used) = 0; 184 | 185 | /** 186 | * Add a lemma to the dictionary. If the dictionary allows to add new 187 | * items and this item does not exist, add it. 188 | * 189 | * @param lemma_str The Chinese string of the lemma. 190 | * @param splids The spelling ids of the lemma. 191 | * @param lemma_len The length of the Chinese lemma. 192 | * @param count The frequency count for this lemma. 193 | */ 194 | virtual LemmaIdType put_lemma(char16 lemma_str[], uint16 splids[], 195 | uint16 lemma_len, uint16 count) = 0; 196 | 197 | /** 198 | * Update a lemma's occuring count. 199 | * 200 | * @param lemma_id The lemma id to update. 201 | * @param delta_count The frequnecy count to ajust. 202 | * @param selected Indicate whether this lemma is selected by user and 203 | * submitted to target edit box. 204 | * @return The id if succeed, 0 if fail. 205 | */ 206 | virtual LemmaIdType update_lemma(LemmaIdType lemma_id, int16 delta_count, 207 | bool selected) = 0; 208 | 209 | /** 210 | * Get the lemma id for the given lemma. 211 | * 212 | * @param lemma_str The Chinese string of the lemma. 213 | * @param splids The spelling ids of the lemma. 214 | * @param lemma_len The length of the lemma. 215 | * @return The matched lemma id, or 0 if fail. 216 | */ 217 | virtual LemmaIdType get_lemma_id(char16 lemma_str[], uint16 splids[], 218 | uint16 lemma_len) = 0; 219 | 220 | /** 221 | * Get the lemma score. 222 | * 223 | * @param lemma_id The lemma id to get score. 224 | * @return The score of the lemma, or 0 if fail. 
225 | */ 226 | virtual LmaScoreType get_lemma_score(LemmaIdType lemma_id) = 0; 227 | 228 | /** 229 | * Get the lemma score. 230 | * 231 | * @param lemma_str The Chinese string of the lemma. 232 | * @param splids The spelling ids of the lemma. 233 | * @param lemma_len The length of the lemma. 234 | * @return The score of the lamm, or 0 if fail. 235 | */ 236 | virtual LmaScoreType get_lemma_score(char16 lemma_str[], uint16 splids[], 237 | uint16 lemma_len) = 0; 238 | 239 | /** 240 | * If the dictionary allowed, remove a lemma from it. 241 | * 242 | * @param lemma_id The id of the lemma to remove. 243 | * @return True if succeed. 244 | */ 245 | virtual bool remove_lemma(LemmaIdType lemma_id) = 0; 246 | 247 | /** 248 | * Get the total occuring count of this atom dictionary. 249 | * 250 | * @return The total occuring count of this atom dictionary. 251 | */ 252 | virtual size_t get_total_lemma_count() = 0; 253 | 254 | /** 255 | * Set the total occuring count of other atom dictionaries. 256 | * 257 | * @param count The total occuring count of other atom dictionaies. 258 | */ 259 | virtual void set_total_lemma_count_of_others(size_t count) = 0; 260 | 261 | /** 262 | * Notify this atom dictionary to flush the cached data to persistent storage 263 | * if necessary. 264 | */ 265 | virtual void flush_cache() = 0; 266 | }; 267 | } 268 | 269 | #endif // PINYINIME_INCLUDE_ATOMDICTBASE_H__ 270 | -------------------------------------------------------------------------------- /pinyin/dictbuilder.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009 The Android Open Source Project 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #ifndef PINYINIME_INCLUDE_DICTBUILDER_H__ 18 | #define PINYINIME_INCLUDE_DICTBUILDER_H__ 19 | 20 | #include 21 | #include "./utf16char.h" 22 | #include "./dictdef.h" 23 | #include "./dictlist.h" 24 | #include "./spellingtable.h" 25 | #include "./spellingtrie.h" 26 | #include "./splparser.h" 27 | 28 | namespace ime_pinyin { 29 | 30 | #ifdef ___BUILD_MODEL___ 31 | 32 | #define ___DO_STATISTICS___ 33 | 34 | class DictTrie; 35 | 36 | class DictBuilder { 37 | private: 38 | // The raw lemma array buffer. 39 | LemmaEntry *lemma_arr_; 40 | size_t lemma_num_; 41 | 42 | // Used to store all possible single char items. 43 | // Two items may have the same Hanzi while their spelling ids are different. 44 | SingleCharItem *scis_; 45 | size_t scis_num_; 46 | 47 | // In the tree, root's level is -1. 48 | // Lemma nodes for root, and level 0 49 | LmaNodeLE0 *lma_nodes_le0_; 50 | 51 | // Lemma nodes for layers whose levels are deeper than 0 52 | LmaNodeGE1 *lma_nodes_ge1_; 53 | 54 | // Number of used lemma nodes 55 | size_t lma_nds_used_num_le0_; 56 | size_t lma_nds_used_num_ge1_; 57 | 58 | // Used to store homophonies' ids. 59 | LemmaIdType *homo_idx_buf_; 60 | // Number of homophonies each of which only contains one Chinese character. 61 | size_t homo_idx_num_eq1_; 62 | // Number of homophonies each of which contains more than one character. 63 | size_t homo_idx_num_gt1_; 64 | 65 | // The items with highest scores. 
66 | LemmaEntry *top_lmas_; 67 | size_t top_lmas_num_; 68 | 69 | SpellingTable *spl_table_; 70 | SpellingParser *spl_parser_; 71 | 72 | #ifdef ___DO_STATISTICS___ 73 | size_t max_sonbuf_len_[kMaxLemmaSize]; 74 | size_t max_homobuf_len_[kMaxLemmaSize]; 75 | 76 | size_t total_son_num_[kMaxLemmaSize]; 77 | size_t total_node_hasson_[kMaxLemmaSize]; 78 | size_t total_sonbuf_num_[kMaxLemmaSize]; 79 | size_t total_sonbuf_allnoson_[kMaxLemmaSize]; 80 | size_t total_node_in_sonbuf_allnoson_[kMaxLemmaSize]; 81 | size_t total_homo_num_[kMaxLemmaSize]; 82 | 83 | size_t sonbufs_num1_; // Number of son buffer with only 1 son 84 | size_t sonbufs_numgt1_; // Number of son buffer with more 1 son; 85 | 86 | size_t total_lma_node_num_; 87 | 88 | void stat_init(); 89 | void stat_print(); 90 | #endif 91 | 92 | public: 93 | 94 | DictBuilder(); 95 | ~DictBuilder(); 96 | 97 | // Build dictionary trie from the file fn_raw. File fn_validhzs provides 98 | // valid chars. If fn_validhzs is NULL, only chars in GB2312 will be 99 | // included. 100 | bool build_dict(const char* fn_raw, const char* fn_validhzs, 101 | DictTrie *dict_trie); 102 | 103 | private: 104 | // Fill in the buffer with id. The caller guarantees that the paramters are 105 | // vaild. 106 | void id_to_charbuf(unsigned char *buf, LemmaIdType id); 107 | 108 | // Update the offset of sons for a node. 109 | void set_son_offset(LmaNodeGE1 *node, size_t offset); 110 | 111 | // Update the offset of homophonies' ids for a node. 112 | void set_homo_id_buf_offset(LmaNodeGE1 *node, size_t offset); 113 | 114 | // Format a speling string. 115 | void format_spelling_str(char *spl_str); 116 | 117 | // Sort the lemma_arr by the hanzi string, and give each of unique items 118 | // a id. Why we need to sort the lemma list according to their Hanzi string 119 | // is to find items started by a given prefix string to do prediction. 120 | // Actually, the single char items are be in other order, for example, 121 | // in spelling id order, etc. 
122 | // Return value is next un-allocated idx available. 123 | LemmaIdType sort_lemmas_by_hz(); 124 | 125 | // Build the SingleCharItem list, and fill the hanzi_scis_ids in the 126 | // lemma buffer lemma_arr_. 127 | // This function should be called after the lemma array is ready. 128 | // Return the number of unique SingleCharItem elements. 129 | size_t build_scis(); 130 | 131 | // Construct a subtree using a subset of the spelling array (from 132 | // item_star to item_end) 133 | // parent is the parent node to update the necessary information 134 | // parent can be a member of LmaNodeLE0 or LmaNodeGE1 135 | bool construct_subset(void* parent, LemmaEntry* lemma_arr, 136 | size_t item_start, size_t item_end, size_t level); 137 | 138 | 139 | // Read valid Chinese Hanzis from the given file. 140 | // num is used to return number of chars. 141 | // The return buffer is sorted and caller needs to free the returned buffer. 142 | char16* read_valid_hanzis(const char *fn_validhzs, size_t *num); 143 | 144 | 145 | // Read a raw dictionary. max_item is the maximum number of items. If there 146 | // are more items in the ditionary, only the first max_item will be read. 147 | // Returned value is the number of items successfully read from the file. 148 | size_t read_raw_dict(const char* fn_raw, const char *fn_validhzs, 149 | size_t max_item); 150 | 151 | // Try to find if a character is in hzs buffer. 152 | bool hz_in_hanzis_list(const char16 *hzs, size_t hzs_len, char16 hz); 153 | 154 | // Try to find if all characters in str are in hzs buffer. 155 | bool str_in_hanzis_list(const char16 *hzs, size_t hzs_len, 156 | const char16 *str, size_t str_len); 157 | 158 | // Get these lemmas with toppest scores. 159 | void get_top_lemmas(); 160 | 161 | // Allocate resource to build dictionary. 162 | // lma_num is the number of items to be loaded 163 | bool alloc_resource(size_t lma_num); 164 | 165 | // Free resource. 
166 | void free_resource(); 167 | }; 168 | #endif // ___BUILD_MODEL___ 169 | } 170 | 171 | #endif // PINYINIME_INCLUDE_DICTBUILDER_H__ 172 | -------------------------------------------------------------------------------- /pinyin/dictdef.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009 The Android Open Source Project 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #ifndef PINYINIME_INCLUDE_DICTDEF_H__ 18 | #define PINYINIME_INCLUDE_DICTDEF_H__ 19 | 20 | #include 21 | #include 22 | #include "./utf16char.h" 23 | 24 | namespace ime_pinyin { 25 | 26 | // Enable the following line when building the binary dictionary model. 27 | #define ___BUILD_MODEL___ 28 | 29 | typedef uint8_t uint8; 30 | typedef uint16_t uint16; 31 | typedef uint32_t uint32; 32 | 33 | typedef int8_t int8; 34 | typedef int16_t int16; 35 | typedef int32_t int32; 36 | typedef int64_t int64; 37 | typedef uint64_t uint64; 38 | 39 | const bool kPrintDebug0 = false; 40 | const bool kPrintDebug1 = false; 41 | const bool kPrintDebug2 = false; 42 | 43 | // The max length of a lemma. 44 | const size_t kMaxLemmaSize = 8; 45 | 46 | // The max length of a Pinyin (spelling). 47 | const size_t kMaxPinyinSize = 6; 48 | 49 | // The number of half spelling ids. For Chinese Pinyin, there 30 half ids. 50 | // See SpellingTrie.h for details. 
51 | const size_t kHalfSpellingIdNum = 29; 52 | 53 | // The maximum number of full spellings. For Chinese Pinyin, there are only 54 | // about 410 spellings. 55 | // If change this value is bigger(needs more bits), please also update 56 | // other structures like SpellingNode, to make sure than a spelling id can be 57 | // stored. 58 | // -1 is because that 0 is never used. 59 | const size_t kMaxSpellingNum = 512 - kHalfSpellingIdNum - 1; 60 | const size_t kMaxSearchSteps = 40; 61 | 62 | // One character predicts its following characters. 63 | const size_t kMaxPredictSize = (kMaxLemmaSize - 1); 64 | 65 | // LemmaIdType must always be size_t. 66 | typedef size_t LemmaIdType; 67 | const size_t kLemmaIdSize = 3; // Actually, a Id occupies 3 bytes in storage. 68 | const size_t kLemmaIdComposing = 0xffffff; 69 | 70 | typedef uint16 LmaScoreType; 71 | typedef uint16 KeyScoreType; 72 | 73 | // Number of items with highest score are kept for prediction purpose. 74 | const size_t kTopScoreLemmaNum = 10; 75 | 76 | const size_t kMaxPredictNumByGt3 = 1; 77 | const size_t kMaxPredictNumBy3 = 2; 78 | const size_t kMaxPredictNumBy2 = 2; 79 | 80 | // The last lemma id (included) for the system dictionary. The system 81 | // dictionary's ids always start from 1. 82 | const LemmaIdType kSysDictIdEnd = 500000; 83 | 84 | // The first lemma id for the user dictionary. 85 | const LemmaIdType kUserDictIdStart = 500001; 86 | 87 | // The last lemma id (included) for the user dictionary. 
88 | const LemmaIdType kUserDictIdEnd = 600000; 89 | 90 | typedef struct { 91 | uint16 half_splid:5; 92 | uint16 full_splid:11; 93 | } SpellingId, *PSpellingId; 94 | 95 | 96 | /** 97 | * We use different node types for different layers 98 | * Statistical data of the building result for a testing dictionary: 99 | * root, level 0, level 1, level 2, level 3 100 | * max son num of one node: 406 280 41 2 - 101 | * max homo num of one node: 0 90 23 2 2 102 | * total node num of a layer: 1 406 31766 13516 993 103 | * total homo num of a layer: 9 5674 44609 12667 995 104 | * 105 | * The node number for root and level 0 won't be larger than 500 106 | * According to the information above, two kinds of nodes can be used; one for 107 | * root and level 0, the other for these layers deeper than 0. 108 | * 109 | * LE = less and equal, 110 | * A node occupies 16 bytes. so, totallly less than 16 * 500 = 8K 111 | */ 112 | struct LmaNodeLE0 { 113 | size_t son_1st_off; 114 | size_t homo_idx_buf_off; 115 | uint16 spl_idx; 116 | uint16 num_of_son; 117 | uint16 num_of_homo; 118 | }; 119 | 120 | /** 121 | * GE = great and equal 122 | * A node occupies 8 bytes. 123 | */ 124 | struct LmaNodeGE1 { 125 | uint16 son_1st_off_l; // Low bits of the son_1st_off 126 | uint16 homo_idx_buf_off_l; // Low bits of the homo_idx_buf_off_1 127 | uint16 spl_idx; 128 | unsigned char num_of_son; // number of son nodes 129 | unsigned char num_of_homo; // number of homo words 130 | unsigned char son_1st_off_h; // high bits of the son_1st_off 131 | unsigned char homo_idx_buf_off_h; // high bits of the homo_idx_buf_off 132 | }; 133 | 134 | #ifdef ___BUILD_MODEL___ 135 | struct SingleCharItem { 136 | float freq; 137 | char16 hz; 138 | SpellingId splid; 139 | }; 140 | 141 | struct LemmaEntry { 142 | LemmaIdType idx_by_py; 143 | LemmaIdType idx_by_hz; 144 | char16 hanzi_str[kMaxLemmaSize + 1]; 145 | 146 | // The SingleCharItem id for each Hanzi. 
147 | uint16 hanzi_scis_ids[kMaxLemmaSize]; 148 | 149 | uint16 spl_idx_arr[kMaxLemmaSize + 1]; 150 | char pinyin_str[kMaxLemmaSize][kMaxPinyinSize + 1]; 151 | unsigned char hz_str_len; 152 | float freq; 153 | }; 154 | #endif // ___BUILD_MODEL___ 155 | 156 | } // namespace ime_pinyin 157 | 158 | #endif // PINYINIME_INCLUDE_DICTDEF_H__ 159 | -------------------------------------------------------------------------------- /pinyin/dictlist.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009 The Android Open Source Project 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include "dictlist.h" 21 | #include "mystdlib.h" 22 | #include "ngram.h" 23 | #include "searchutility.h" 24 | 25 | namespace ime_pinyin { 26 | 27 | DictList::DictList() { 28 | initialized_ = false; 29 | scis_num_ = 0; 30 | scis_hz_ = NULL; 31 | scis_splid_ = NULL; 32 | buf_ = NULL; 33 | spl_trie_ = SpellingTrie::get_cpinstance(); 34 | 35 | assert(kMaxLemmaSize == 8); 36 | cmp_func_[0] = cmp_hanzis_1; 37 | cmp_func_[1] = cmp_hanzis_2; 38 | cmp_func_[2] = cmp_hanzis_3; 39 | cmp_func_[3] = cmp_hanzis_4; 40 | cmp_func_[4] = cmp_hanzis_5; 41 | cmp_func_[5] = cmp_hanzis_6; 42 | cmp_func_[6] = cmp_hanzis_7; 43 | cmp_func_[7] = cmp_hanzis_8; 44 | } 45 | 46 | DictList::~DictList() { 47 | free_resource(); 48 | } 49 | 50 | bool DictList::alloc_resource(size_t buf_size, size_t scis_num) { 51 | // Allocate memory 52 | buf_ = static_cast(malloc(buf_size * sizeof(char16))); 53 | if (NULL == buf_) 54 | return false; 55 | 56 | scis_num_ = scis_num; 57 | 58 | scis_hz_ = static_cast(malloc(scis_num_ * sizeof(char16))); 59 | if (NULL == scis_hz_) 60 | return false; 61 | 62 | scis_splid_ = static_cast 63 | (malloc(scis_num_ * sizeof(SpellingId))); 64 | 65 | if (NULL == scis_splid_) 66 | return false; 67 | 68 | return true; 69 | } 70 | 71 | void DictList::free_resource() { 72 | if (NULL != buf_) 73 | free(buf_); 74 | buf_ = NULL; 75 | 76 | if (NULL != scis_hz_) 77 | free(scis_hz_); 78 | scis_hz_ = NULL; 79 | 80 | if (NULL != scis_splid_) 81 | free(scis_splid_); 82 | scis_splid_ = NULL; 83 | } 84 | 85 | #ifdef ___BUILD_MODEL___ 86 | bool DictList::init_list(const SingleCharItem *scis, size_t scis_num, 87 | const LemmaEntry *lemma_arr, size_t lemma_num) { 88 | if (NULL == scis || 0 == scis_num || NULL == lemma_arr || 0 == lemma_num) 89 | return false; 90 | 91 | initialized_ = false; 92 | 93 | if (NULL != buf_) 94 | free(buf_); 95 | 96 | // calculate the size 97 | size_t buf_size = calculate_size(lemma_arr, lemma_num); 98 | if (0 
== buf_size) 99 | return false; 100 | 101 | if (!alloc_resource(buf_size, scis_num)) 102 | return false; 103 | 104 | fill_scis(scis, scis_num); 105 | 106 | // Copy the related content from the array to inner buffer 107 | fill_list(lemma_arr, lemma_num); 108 | 109 | initialized_ = true; 110 | return true; 111 | } 112 | 113 | size_t DictList::calculate_size(const LemmaEntry* lemma_arr, size_t lemma_num) { 114 | size_t last_hz_len = 0; 115 | size_t list_size = 0; 116 | size_t id_num = 0; 117 | 118 | for (size_t i = 0; i < lemma_num; i++) { 119 | if (0 == i) { 120 | last_hz_len = lemma_arr[i].hz_str_len; 121 | 122 | assert(last_hz_len > 0); 123 | assert(lemma_arr[0].idx_by_hz == 1); 124 | 125 | id_num++; 126 | start_pos_[0] = 0; 127 | start_id_[0] = id_num; 128 | 129 | last_hz_len = 1; 130 | list_size += last_hz_len; 131 | } else { 132 | size_t current_hz_len = lemma_arr[i].hz_str_len; 133 | 134 | assert(current_hz_len >= last_hz_len); 135 | 136 | if (current_hz_len == last_hz_len) { 137 | list_size += current_hz_len; 138 | id_num++; 139 | } else { 140 | for (size_t len = last_hz_len; len < current_hz_len - 1; len++) { 141 | start_pos_[len] = start_pos_[len - 1]; 142 | start_id_[len] = start_id_[len - 1]; 143 | } 144 | 145 | start_pos_[current_hz_len - 1] = list_size; 146 | 147 | id_num++; 148 | start_id_[current_hz_len - 1] = id_num; 149 | 150 | last_hz_len = current_hz_len; 151 | list_size += current_hz_len; 152 | } 153 | } 154 | } 155 | 156 | for (size_t i = last_hz_len; i <= kMaxLemmaSize; i++) { 157 | if (0 == i) { 158 | start_pos_[0] = 0; 159 | start_id_[0] = 1; 160 | } else { 161 | start_pos_[i] = list_size; 162 | start_id_[i] = id_num; 163 | } 164 | } 165 | 166 | return start_pos_[kMaxLemmaSize]; 167 | } 168 | 169 | void DictList::fill_scis(const SingleCharItem *scis, size_t scis_num) { 170 | assert(scis_num_ == scis_num); 171 | 172 | for (size_t pos = 0; pos < scis_num_; pos++) { 173 | scis_hz_[pos] = scis[pos].hz; 174 | scis_splid_[pos] = scis[pos].splid; 175 
| } 176 | } 177 | 178 | void DictList::fill_list(const LemmaEntry* lemma_arr, size_t lemma_num) { 179 | size_t current_pos = 0; 180 | 181 | utf16_strncpy(buf_, lemma_arr[0].hanzi_str, 182 | lemma_arr[0].hz_str_len); 183 | 184 | current_pos = lemma_arr[0].hz_str_len; 185 | 186 | size_t id_num = 1; 187 | 188 | for (size_t i = 1; i < lemma_num; i++) { 189 | utf16_strncpy(buf_ + current_pos, lemma_arr[i].hanzi_str, 190 | lemma_arr[i].hz_str_len); 191 | 192 | id_num++; 193 | current_pos += lemma_arr[i].hz_str_len; 194 | } 195 | 196 | assert(current_pos == start_pos_[kMaxLemmaSize]); 197 | assert(id_num == start_id_[kMaxLemmaSize]); 198 | } 199 | 200 | char16* DictList::find_pos2_startedbyhz(char16 hz_char) { 201 | char16 *found_2w = static_cast 202 | (mybsearch(&hz_char, buf_ + start_pos_[1], 203 | (start_pos_[2] - start_pos_[1]) / 2, 204 | sizeof(char16) * 2, cmp_hanzis_1)); 205 | if (NULL == found_2w) 206 | return NULL; 207 | 208 | while (found_2w > buf_ + start_pos_[1] && *found_2w == *(found_2w - 1)) 209 | found_2w -= 2; 210 | 211 | return found_2w; 212 | } 213 | #endif // ___BUILD_MODEL___ 214 | 215 | char16* DictList::find_pos_startedbyhzs(const char16 last_hzs[], 216 | size_t word_len, int (*cmp_func)(const void *, const void *)) { 217 | char16 *found_w = static_cast 218 | (mybsearch(last_hzs, buf_ + start_pos_[word_len - 1], 219 | (start_pos_[word_len] - start_pos_[word_len - 1]) 220 | / word_len, 221 | sizeof(char16) * word_len, cmp_func)); 222 | 223 | if (NULL == found_w) 224 | return NULL; 225 | 226 | while (found_w > buf_ + start_pos_[word_len -1] && 227 | cmp_func(found_w, found_w - word_len) == 0) 228 | found_w -= word_len; 229 | 230 | return found_w; 231 | } 232 | 233 | size_t DictList::predict(const char16 last_hzs[], uint16 hzs_len, 234 | NPredictItem *npre_items, size_t npre_max, 235 | size_t b4_used) { 236 | assert(hzs_len <= kMaxPredictSize && hzs_len > 0); 237 | 238 | // 1. 
Prepare work 239 | int (*cmp_func)(const void *, const void *) = cmp_func_[hzs_len - 1]; 240 | 241 | NGram& ngram = NGram::get_instance(); 242 | 243 | size_t item_num = 0; 244 | 245 | // 2. Do prediction 246 | for (uint16 pre_len = 1; pre_len <= kMaxPredictSize + 1 - hzs_len; 247 | pre_len++) { 248 | uint16 word_len = hzs_len + pre_len; 249 | char16 *w_buf = find_pos_startedbyhzs(last_hzs, word_len, cmp_func); 250 | if (NULL == w_buf) 251 | continue; 252 | while (w_buf < buf_ + start_pos_[word_len] && 253 | cmp_func(w_buf, last_hzs) == 0 && 254 | item_num < npre_max) { 255 | memset(npre_items + item_num, 0, sizeof(NPredictItem)); 256 | utf16_strncpy(npre_items[item_num].pre_hzs, w_buf + hzs_len, pre_len); 257 | npre_items[item_num].psb = 258 | ngram.get_uni_psb((size_t)(w_buf - buf_ - start_pos_[word_len - 1]) 259 | / word_len + start_id_[word_len - 1]); 260 | npre_items[item_num].his_len = hzs_len; 261 | item_num++; 262 | w_buf += word_len; 263 | } 264 | } 265 | 266 | size_t new_num = 0; 267 | for (size_t i = 0; i < item_num; i++) { 268 | // Try to find it in the existing items 269 | size_t e_pos; 270 | for (e_pos = 1; e_pos <= b4_used; e_pos++) { 271 | if (utf16_strncmp((*(npre_items - e_pos)).pre_hzs, npre_items[i].pre_hzs, 272 | kMaxPredictSize) == 0) 273 | break; 274 | } 275 | if (e_pos <= b4_used) 276 | continue; 277 | 278 | // If not found, append it to the buffer 279 | npre_items[new_num] = npre_items[i]; 280 | new_num++; 281 | } 282 | 283 | return new_num; 284 | } 285 | 286 | uint16 DictList::get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, 287 | uint16 str_max) { 288 | if (!initialized_ || id_lemma >= start_id_[kMaxLemmaSize] || NULL == str_buf 289 | || str_max <= 1) 290 | return 0; 291 | 292 | // Find the range 293 | for (uint16 i = 0; i < kMaxLemmaSize; i++) { 294 | if (i + 1 > str_max - 1) 295 | return 0; 296 | if (start_id_[i] <= id_lemma && start_id_[i + 1] > id_lemma) { 297 | size_t id_span = id_lemma - start_id_[i]; 298 | 299 | uint16 *buf = 
buf_ + start_pos_[i] + id_span * (i + 1); 300 | for (uint16 len = 0; len <= i; len++) { 301 | str_buf[len] = buf[len]; 302 | } 303 | str_buf[i+1] = (char16)'\0'; 304 | return i + 1; 305 | } 306 | } 307 | return 0; 308 | } 309 | 310 | uint16 DictList::get_splids_for_hanzi(char16 hanzi, uint16 half_splid, 311 | uint16 *splids, uint16 max_splids) { 312 | char16 *hz_found = static_cast 313 | (mybsearch(&hanzi, scis_hz_, scis_num_, sizeof(char16), cmp_hanzis_1)); 314 | assert(NULL != hz_found && hanzi == *hz_found); 315 | 316 | // Move to the first one. 317 | while (hz_found > scis_hz_ && hanzi == *(hz_found - 1)) 318 | hz_found--; 319 | 320 | // First try to found if strict comparison result is not zero. 321 | char16 *hz_f = hz_found; 322 | bool strict = false; 323 | while (hz_f < scis_hz_ + scis_num_ && hanzi == *hz_f) { 324 | uint16 pos = hz_f - scis_hz_; 325 | if (0 == half_splid || scis_splid_[pos].half_splid == half_splid) { 326 | strict = true; 327 | } 328 | hz_f++; 329 | } 330 | 331 | uint16 found_num = 0; 332 | while (hz_found < scis_hz_ + scis_num_ && hanzi == *hz_found) { 333 | uint16 pos = hz_found - scis_hz_; 334 | if (0 == half_splid || 335 | (strict && scis_splid_[pos].half_splid == half_splid) || 336 | (!strict && spl_trie_->half_full_compatible(half_splid, 337 | scis_splid_[pos].full_splid))) { 338 | assert(found_num + 1 < max_splids); 339 | splids[found_num] = scis_splid_[pos].full_splid; 340 | found_num++; 341 | } 342 | hz_found++; 343 | } 344 | 345 | return found_num; 346 | } 347 | 348 | LemmaIdType DictList::get_lemma_id(const char16 *str, uint16 str_len) { 349 | if (NULL == str || str_len > kMaxLemmaSize) 350 | return 0; 351 | 352 | char16 *found = find_pos_startedbyhzs(str, str_len, cmp_func_[str_len - 1]); 353 | if (NULL == found) 354 | return 0; 355 | 356 | assert(found > buf_); 357 | assert(static_cast(found - buf_) >= start_pos_[str_len - 1]); 358 | return static_cast 359 | (start_id_[str_len - 1] + 360 | (found - buf_ - start_pos_[str_len - 
1]) / str_len); 361 | } 362 | 363 | void DictList::convert_to_hanzis(char16 *str, uint16 str_len) { 364 | assert(NULL != str); 365 | 366 | for (uint16 str_pos = 0; str_pos < str_len; str_pos++) { 367 | str[str_pos] = scis_hz_[str[str_pos]]; 368 | } 369 | } 370 | 371 | void DictList::convert_to_scis_ids(char16 *str, uint16 str_len) { 372 | assert(NULL != str); 373 | 374 | for (uint16 str_pos = 0; str_pos < str_len; str_pos++) { 375 | str[str_pos] = 0x100; 376 | } 377 | } 378 | 379 | bool DictList::save_list(FILE *fp) { 380 | if (!initialized_ || NULL == fp) 381 | return false; 382 | 383 | if (NULL == buf_ || 0 == start_pos_[kMaxLemmaSize] || 384 | NULL == scis_hz_ || NULL == scis_splid_ || 0 == scis_num_) 385 | return false; 386 | 387 | if (fwrite(&scis_num_, sizeof(size_t), 1, fp) != 1) 388 | return false; 389 | 390 | if (fwrite(start_pos_, sizeof(size_t), kMaxLemmaSize + 1, fp) != 391 | kMaxLemmaSize + 1) 392 | return false; 393 | 394 | if (fwrite(start_id_, sizeof(size_t), kMaxLemmaSize + 1, fp) != 395 | kMaxLemmaSize + 1) 396 | return false; 397 | 398 | if (fwrite(scis_hz_, sizeof(char16), scis_num_, fp) != scis_num_) 399 | return false; 400 | 401 | if (fwrite(scis_splid_, sizeof(SpellingId), scis_num_, fp) != scis_num_) 402 | return false; 403 | 404 | if (fwrite(buf_, sizeof(char16), start_pos_[kMaxLemmaSize], fp) != 405 | start_pos_[kMaxLemmaSize]) 406 | return false; 407 | 408 | return true; 409 | } 410 | 411 | bool DictList::load_list(FILE *fp) { 412 | if (NULL == fp) 413 | return false; 414 | 415 | initialized_ = false; 416 | 417 | if (fread(&scis_num_, sizeof(size_t), 1, fp) != 1) 418 | return false; 419 | 420 | if (fread(start_pos_, sizeof(size_t), kMaxLemmaSize + 1, fp) != 421 | kMaxLemmaSize + 1) 422 | return false; 423 | 424 | if (fread(start_id_, sizeof(size_t), kMaxLemmaSize + 1, fp) != 425 | kMaxLemmaSize + 1) 426 | return false; 427 | 428 | free_resource(); 429 | 430 | if (!alloc_resource(start_pos_[kMaxLemmaSize], scis_num_)) 431 | return false; 
namespace ime_pinyin {

// Word list of the dictionary: all lemma strings packed by length into one
// char16 buffer, plus the single-character table (Hanzi and spelling id).
class DictList {
 private:
  // True only after a successful load_list() (or init_list() in model builds).
  bool initialized_;

  const SpellingTrie *spl_trie_;

  // Number of SingleCharItem. The first is blank, because id 0 is invalid.
  size_t scis_num_;
  // Parallel arrays of scis_num_ entries: Hanzi and its spelling id.
  char16 *scis_hz_;
  SpellingId *scis_splid_;

  // The large memory block to store the word list.
  char16 *buf_;

  // Starting position of those words whose lengths are i+1, counted in
  // char16. The extra last entry holds the end of the whole list.
  size_t start_pos_[kMaxLemmaSize + 1];

  // Lemma id of the first word whose length is i+1.
  size_t start_id_[kMaxLemmaSize + 1];

  // cmp_func_[n - 1] is the comparator used for words/histories of n
  // characters (see predict() and get_lemma_id()).
  int (*cmp_func_[kMaxLemmaSize])(const void *, const void *);

  bool alloc_resource(size_t buf_size, size_t scim_num);

  void free_resource();

#ifdef ___BUILD_MODEL___
  // Calculate the requested memory, and fill the start_pos_[] and start_id_[]
  // tables on the way.
  size_t calculate_size(const LemmaEntry *lemma_arr, size_t lemma_num);

  // Copy the single-character items into scis_hz_ / scis_splid_.
  void fill_scis(const SingleCharItem *scis, size_t scis_num);

  // Copy the related content to the inner buffer
  // It should be called after calculate_size()
  void fill_list(const LemmaEntry *lemma_arr, size_t lemma_num);

  // Find the starting position for the buffer of those 2-character Chinese word
  // whose first character is the given Chinese character.
  char16* find_pos2_startedbyhz(char16 hz_char);
#endif

  // Find the starting position for the buffer of those words whose lengths are
  // word_len. The given parameter cmp_func decides how many characters from
  // beginning will be used to compare.
  char16* find_pos_startedbyhzs(const char16 last_hzs[],
                                size_t word_Len,
                                int (*cmp_func)(const void *, const void *));

 public:

  DictList();
  ~DictList();

  // Write / read the list as raw binary to / from an already opened stream.
  // Both return false on any failure.
  bool save_list(FILE *fp);
  bool load_list(FILE *fp);

#ifdef ___BUILD_MODEL___
  // Init the list from the LemmaEntry array.
  // lemma_arr should have been sorted by the hanzi_str, and have been given
  // ids from 1
  bool init_list(const SingleCharItem *scis, size_t scis_num,
                 const LemmaEntry *lemma_arr, size_t lemma_num);
#endif

  // Get the hanzi string for the given id. Returns the string length, or 0
  // on failure.
  uint16 get_lemma_str(LemmaIdType id_hz, char16 *str_buf, uint16 str_max);

  // Replace, in place, each element of str (an index into the
  // single-character table) with the Hanzi stored there.
  void convert_to_hanzis(char16 *str, uint16 str_len);

  void convert_to_scis_ids(char16 *str, uint16 str_len);

  // last_hzs stores the last n Chinese characters history, its length should be
  // less or equal than kMaxPredictSize.
  // hzs_len specifies the length(<= kMaxPredictSize).
  // npre_items is used to store the result.
  // npre_max specifies the capacity of npre_items.
  // b4_used specifies how many items before npre_items have been used.
  // Returned value is the number of newly added items.
  size_t predict(const char16 last_hzs[], uint16 hzs_len,
                 NPredictItem *npre_items, size_t npre_max,
                 size_t b4_used);

  // If half_splid is a valid half spelling id, return those full spelling
  // ids which share this half id.
  uint16 get_splids_for_hanzi(char16 hanzi, uint16 half_splid,
                              uint16 *splids, uint16 max_splids);

  // Get the lemma id of the given string; 0 means it was not found.
  LemmaIdType get_lemma_id(const char16 *str, uint16 str_len);
};
}
namespace ime_pinyin {

// Trie-based system dictionary.
// NOTE(review): no access specifier is given, so AtomDictBase is a PRIVATE
// base of this class -- confirm that is intended.
class DictTrie : AtomDictBase {
 private:
  struct ParsingMark {
    size_t node_offset:24;
    size_t node_num:8;  // Number of nodes with this spelling id given
                        // by spl_id. If spl_id is a Shengmu, for nodes
                        // in the first layer of DictTrie, it equals to
                        // SpellingTrie::shm2full_num(); but for those
                        // nodes which are not in the first layer,
                        // node_num < SpellingTrie::shm2full_num().
                        // For a full spelling id, node_num = 1;
  };

  // Used to indicate an extended mile stone.
  // An extended mile stone is used to mark a partial match in the dictionary
  // trie to speed up further potential extending.
  // For example, when the user inputs "w", a mile stone is created to mark the
  // partial match status, so that when user inputs another char 'm', it will be
  // faster to extend search space based on this mile stone.
  //
  // For partial match status of "wm", there can be more than one sub mile
  // stone, for example, "wm" can be matched to "wanm", "wom", ..., etc, so
  // there may be more than one parsing mark used to mark these partial
  // matchings.
  // A mile stone records the starting position in the mark list and number of
  // marks.
  struct MileStone {
    uint16 mark_start;
    uint16 mark_num;
  };

  DictList* dict_list_;

  const SpellingTrie *spl_trie_;

  LmaNodeLE0* root_;        // Nodes for root and the first layer.
  LmaNodeGE1* nodes_ge1_;   // Nodes for other layers.

  // A quick index from spelling id to the LmaNodeLE0 node buffer, or
  // to the root_ buffer.
  // Index length:
  // SpellingTrie::get_instance().get_spelling_num() + 1. The last one is used
  // to get the end.
  // All Shengmu ids are not indexed because they will be converted into
  // corresponding full ids.
  // So, given an id splid, the son is:
  // root_[splid_le0_index_[splid - kFullSplIdStart]]
  uint16 *splid_le0_index_;

  size_t lma_node_num_le0_;
  size_t lma_node_num_ge1_;

  // The first part is for homophones, and the last top_lmas_num_ items are
  // lemmas with highest scores.
  unsigned char *lma_idx_buf_;
  size_t lma_idx_buf_len_;  // The total size of lma_idx_buf_ in byte.
  size_t total_lma_num_;    // Total number of lemmas in this dictionary.
  size_t top_lmas_num_;     // Number of lemma with highest scores.

  // Parsing mark list used to mark the detailed extended statuses.
  ParsingMark *parsing_marks_;
  // The position for next available mark.
  uint16 parsing_marks_pos_;

  // Mile stone list used to mark the extended status.
  MileStone *mile_stones_;
  // The position for the next available mile stone. We use positions (except 0)
  // as handles.
  MileStoneHandle mile_stones_pos_;

  // Get the offset of sons for a node.
  inline size_t get_son_offset(const LmaNodeGE1 *node);

  // Get the offset of homophone ids for a node.
  inline size_t get_homo_idx_buf_offset(const LmaNodeGE1 *node);

  // Get the lemma id by the offset.
  inline LemmaIdType get_lemma_id(size_t id_offset);

  void free_resource(bool free_dict_list);

  // Load the dictionary from an already opened stream.
  bool load_dict(FILE *fp);

  // Given a LmaNodeLE0 node, extract the lemmas specified by it, and fill
  // them into the lpi_items buffer.
  // This function is called by the search engine.
  size_t fill_lpi_buffer(LmaPsbItem lpi_items[], size_t max_size,
                         LmaNodeLE0 *node);

  // Given a LmaNodeGE1 node, extract the lemmas specified by it, and fill
  // them into the lpi_items buffer.
  // This function is called by inner functions extend_dict0(), extend_dict1()
  // and extend_dict2().
  size_t fill_lpi_buffer(LmaPsbItem lpi_items[], size_t max_size,
                         size_t homo_buf_off, LmaNodeGE1 *node,
                         uint16 lma_len);

  // Extend in the trie from level 0.
  MileStoneHandle extend_dict0(MileStoneHandle from_handle,
                               const DictExtPara *dep, LmaPsbItem *lpi_items,
                               size_t lpi_max, size_t *lpi_num);

  // Extend in the trie from level 1.
  MileStoneHandle extend_dict1(MileStoneHandle from_handle,
                               const DictExtPara *dep, LmaPsbItem *lpi_items,
                               size_t lpi_max, size_t *lpi_num);

  // Extend in the trie from level 2.
  MileStoneHandle extend_dict2(MileStoneHandle from_handle,
                               const DictExtPara *dep, LmaPsbItem *lpi_items,
                               size_t lpi_max, size_t *lpi_num);

  // Try to extend the given spelling id buffer, and if the given id_lemma can
  // be successfully gotten, return true;
  // The given spelling ids are all valid full ids.
  bool try_extend(const uint16 *splids, uint16 splid_num, LemmaIdType id_lemma);

#ifdef ___BUILD_MODEL___
  bool save_dict(FILE *fp);
#endif  // ___BUILD_MODEL___

  static const int kMaxMileStone = 100;
  static const int kMaxParsingMark = 600;
  static const MileStoneHandle kFirstValidMileStoneHandle = 1;

  friend class DictParser;
  friend class DictBuilder;

 public:

  DictTrie();
  ~DictTrie();

#ifdef ___BUILD_MODEL___
  // Construct the tree from the file fn_raw.
  // fn_validhzs provide the valid hanzi list. If fn_validhzs is
  // NULL, only chars in GB2312 will be included.
  bool build_dict(const char *fn_raw, const char *fn_validhzs);

  // Save the binary dictionary
  // Actually, the SpellingTrie/DictList instance will be also saved.
  bool save_dict(const char *filename);
#endif  // ___BUILD_MODEL___

  void convert_to_hanzis(char16 *str, uint16 str_len);

  void convert_to_scis_ids(char16 *str, uint16 str_len);

  // Load a binary dictionary
  // The SpellingTrie instance/DictList will be also loaded
  bool load_dict(const char *filename, LemmaIdType start_id,
                 LemmaIdType end_id);
  bool load_dict_fd(int sys_fd, long start_offset, long length,
                    LemmaIdType start_id, LemmaIdType end_id);
  // Inline stubs: always report success / zero lemmas.
  bool close_dict() {return true;}
  size_t number_of_lemmas() {return 0;}

  void reset_milestones(uint16 from_step, MileStoneHandle from_handle);

  MileStoneHandle extend_dict(MileStoneHandle from_handle,
                              const DictExtPara *dep,
                              LmaPsbItem *lpi_items,
                              size_t lpi_max, size_t *lpi_num);

  size_t get_lpis(const uint16 *splid_str, uint16 splid_str_len,
                  LmaPsbItem *lpi_items, size_t lpi_max);

  uint16 get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, uint16 str_max);

  uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
                          uint16 splids_max, bool arg_valid);

  size_t predict(const char16 *last_hzs, uint16 hzs_len,
                 NPredictItem *npre_items, size_t npre_max,
                 size_t b4_used);

  // The inline bodies below ignore their arguments and return a constant
  // (0 or false); this class implements no lemma editing or scoring here.
  LemmaIdType put_lemma(char16 lemma_str[], uint16 splids[],
                        uint16 lemma_len, uint16 count) {return 0;}

  LemmaIdType update_lemma(LemmaIdType lemma_id, int16 delta_count,
                           bool selected) {return 0;}

  LemmaIdType get_lemma_id(char16 lemma_str[], uint16 splids[],
                           uint16 lemma_len) {return 0;}

  LmaScoreType get_lemma_score(LemmaIdType lemma_id) {return 0;}

  LmaScoreType get_lemma_score(char16 lemma_str[], uint16 splids[],
                        uint16 lemma_len) {return 0;}

  bool remove_lemma(LemmaIdType lemma_id) {return false;}

  size_t get_total_lemma_count() {return 0;}
  void set_total_lemma_count_of_others(size_t count);

  void flush_cache() {}

  // Look up a lemma id by its Hanzi string only (no spelling ids).
  LemmaIdType get_lemma_id(const char16 lemma_str[], uint16 lemma_len);

  // Fill the lemmas with highest scores to the prediction buffer.
  // his_len is the history length to fill in the prediction buffer.
  size_t predict_top_lmas(size_t his_len, NPredictItem *npre_items,
                          size_t npre_max, size_t b4_used);
};
}
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include "lpicache.h" 19 | 20 | namespace ime_pinyin { 21 | 22 | LpiCache* LpiCache::instance_ = NULL; 23 | 24 | LpiCache::LpiCache() { 25 | lpi_cache_ = new LmaPsbItem[kFullSplIdStart * kMaxLpiCachePerId]; 26 | lpi_cache_len_ = new uint16[kFullSplIdStart]; 27 | assert(NULL != lpi_cache_); 28 | assert(NULL != lpi_cache_len_); 29 | for (uint16 id = 0; id < kFullSplIdStart; id++) 30 | lpi_cache_len_[id] = 0; 31 | } 32 | 33 | LpiCache::~LpiCache() { 34 | if (NULL != lpi_cache_) 35 | delete [] lpi_cache_; 36 | 37 | if (NULL != lpi_cache_len_) 38 | delete [] lpi_cache_len_; 39 | } 40 | 41 | LpiCache& LpiCache::get_instance() { 42 | if (NULL == instance_) { 43 | instance_ = new LpiCache(); 44 | assert(NULL != instance_); 45 | } 46 | return *instance_; 47 | } 48 | 49 | bool LpiCache::is_cached(uint16 splid) { 50 | if (splid >= kFullSplIdStart) 51 | return false; 52 | return lpi_cache_len_[splid] != 0; 53 | } 54 | 55 | size_t LpiCache::put_cache(uint16 splid, LmaPsbItem lpi_items[], 56 | size_t lpi_num) { 57 | uint16 num = kMaxLpiCachePerId; 58 | if (num > lpi_num) 59 | num = static_cast(lpi_num); 60 | 61 | LmaPsbItem *lpi_cache_this = lpi_cache_ + splid * kMaxLpiCachePerId; 62 | for (uint16 pos = 0; pos < num; pos++) 63 | lpi_cache_this[pos] = lpi_items[pos]; 64 | 65 | lpi_cache_len_[splid] = num; 66 | return num; 67 | } 68 | 69 | size_t LpiCache::get_cache(uint16 splid, LmaPsbItem lpi_items[], 70 | size_t lpi_max) { 71 | if (lpi_max > lpi_cache_len_[splid]) 72 | lpi_max = lpi_cache_len_[splid]; 73 | 
namespace ime_pinyin {

// Used to cache LmaPsbItem list for half spelling ids.
class LpiCache {
 private:
  // The single shared instance, created on the first get_instance() call.
  static LpiCache *instance_;
  // Capacity of the cache slot of one spelling id, in items.
  static const int kMaxLpiCachePerId = 15;

  // All slots back to back: the slot of id n starts at
  // n * kMaxLpiCachePerId.
  LmaPsbItem *lpi_cache_;
  // Number of valid items in each slot; 0 means nothing is cached.
  uint16 *lpi_cache_len_;

 public:
  LpiCache();
  ~LpiCache();

  static LpiCache& get_instance();

  // Test if the LPI list of the given splid has been cached.
  // If splid is a full spelling id, it returns false, because we only cache
  // list for half ids.
  bool is_cached(uint16 splid);

  // Put LPI list to cache. If the length of the list, lpi_num, is longer than
  // the cache buffer, the list will be truncated, and the function returns the
  // maximum length of the cache buffer.
  // Note: splid must be a half id, and lpi_items must be not NULL. The
  // caller of this function should guarantee this.
  size_t put_cache(uint16 splid, LmaPsbItem lpi_items[], size_t lpi_num);

  // Get the cached list for the given half id.
  // Return the number of items copied, which is at most lpi_max.
  // Note: splid must be a half id, and lpi_items must be not NULL. The
  // caller of this function should guarantee this.
  size_t get_cache(uint16 splid, LmaPsbItem lpi_items[], size_t lpi_max);
};

}  // namespace
15 | */ 16 | 17 | #ifndef PINYINIME_ANDPY_INCLUDE_MATRIXSEARCH_H__ 18 | #define PINYINIME_ANDPY_INCLUDE_MATRIXSEARCH_H__ 19 | 20 | #include 21 | #include "./atomdictbase.h" 22 | #include "./dicttrie.h" 23 | #include "./searchutility.h" 24 | #include "./spellingtrie.h" 25 | #include "./splparser.h" 26 | 27 | namespace ime_pinyin { 28 | 29 | static const size_t kMaxRowNum = kMaxSearchSteps; 30 | 31 | typedef struct { 32 | // MileStoneHandle objects for the system and user dictionaries. 33 | MileStoneHandle dict_handles[2]; 34 | // From which DMI node. -1 means it's from root. 35 | PoolPosType dmi_fr; 36 | // The spelling id for the Pinyin string from the previous DMI to this node. 37 | // If it is a half id like Shengmu, the node pointed by dict_node is the first 38 | // node with this Shengmu, 39 | uint16 spl_id; 40 | // What's the level of the dict node. Level of root is 0, but root is never 41 | // recorded by dict_node. 42 | unsigned char dict_level:7; 43 | // If this node is for composing phrase, this bit is 1. 44 | unsigned char c_phrase:1; 45 | // Whether the spl_id is parsed with a split character at the end. 46 | unsigned char splid_end_split:1; 47 | // What's the length of the spelling string for this match, for the whole 48 | // word. 49 | unsigned char splstr_len:7; 50 | // Used to indicate whether all spelling ids from the root are full spelling 51 | // ids. This information is useful for keymapping mode(not finished). Because 52 | // in this mode, there is no clear boundaries, we prefer those results which 53 | // have full spelling ids. 54 | unsigned char all_full_id:1; 55 | } DictMatchInfo, *PDictMatchInfo; 56 | 57 | typedef struct MatrixNode { 58 | LemmaIdType id; 59 | float score; 60 | MatrixNode *from; 61 | // From which DMI node. Used to trace the spelling segmentation. 
62 | PoolPosType dmi_fr; 63 | uint16 step; 64 | } MatrixNode, *PMatrixNode; 65 | 66 | typedef struct { 67 | // The MatrixNode position in the matrix pool 68 | PoolPosType mtrx_nd_pos; 69 | // The DictMatchInfo position in the DictMatchInfo pool. 70 | PoolPosType dmi_pos; 71 | uint16 mtrx_nd_num; 72 | uint16 dmi_num:15; 73 | // Used to indicate whether there are dmi nodes in this step with full 74 | // spelling id. This information is used to decide whether a substring of a 75 | // valid Pinyin should be extended. 76 | // 77 | // Example1: shoudao 78 | // When the last char 'o' is added, the parser will find "dao" is a valid 79 | // Pinyin, and because all dmi nodes at location 'd' (including those for 80 | // "shoud", and those for "d") have Shengmu id only, so it is not necessary 81 | // to extend "ao", otherwise the result may be "shoud ao", that is not 82 | // reasonable. 83 | // 84 | // Example2: hengao 85 | // When the last 'o' is added, the parser finds "gao" is a valid Pinyin. 86 | // Because some dmi nodes at 'g' has Shengmu ids (hen'g and g), but some dmi 87 | // nodes at 'g' has full ids ('heng'), so it is necessary to extend "ao", thus 88 | // "heng ao" can also be the result. 89 | // 90 | // Similarly, "ganga" is expanded to "gang a". 91 | // 92 | // For Pinyin string "xian", because "xian" is a valid Pinyin, because all dmi 93 | // nodes at 'x' only have Shengmu ids, the parser will not try "x ian" (and it 94 | // is not valid either). If the parser uses break in the loop, the result 95 | // always be "xian"; but if the parser uses continue in the loop, "xi an" will 96 | // also be tried. This behaviour can be set via the function 97 | // set_xi_an_switch(). 98 | uint16 dmi_has_full_id:1; 99 | // Points to a MatrixNode of the current step to indicate which choice the 100 | // user selects. 
101 | MatrixNode *mtrx_nd_fixed; 102 | } MatrixRow, *PMatrixRow; 103 | 104 | // When user inputs and selects candidates, the fixed lemma ids are stored in 105 | // lma_id_ of class MatrixSearch, and fixed_lmas_ is used to indicate how many 106 | // lemmas from the beginning are fixed. If user deletes Pinyin characters one 107 | // by one from the end, these fixed lemmas can be unlocked one by one when 108 | // necessary. Whenever user deletes a Chinese character and its spelling string 109 | // in these fixed lemmas, all fixed lemmas will be merged together into a unit 110 | // named ComposingPhrase with a lemma id kLemmaIdComposing, and this composing 111 | // phrase will be the first lemma in the sentence. Because it contains some 112 | // modified lemmas (by deleting a character), these merged lemmas are called 113 | // sub lemmas (sublma), and each of them are represented individually, so that 114 | // when user deletes Pinyin characters from the end, these sub lemmas can also 115 | // be unlocked one by one. 116 | typedef struct { 117 | uint16 spl_ids[kMaxRowNum]; 118 | uint16 spl_start[kMaxRowNum]; 119 | char16 chn_str[kMaxRowNum]; // Chinese string. 120 | uint16 sublma_start[kMaxRowNum]; // Counted in Chinese characters. 121 | size_t sublma_num; 122 | uint16 length; // Counted in Chinese characters. 123 | } ComposingPhrase, *TComposingPhrase; 124 | 125 | class MatrixSearch { 126 | private: 127 | // If it is true, prediction list by string whose length is greater than 1 128 | // will be limited to a reasonable number. 129 | static const bool kPredictLimitGt1 = false; 130 | 131 | // If it is true, the engine will prefer long history based prediction, 132 | // for example, when user inputs "BeiJing", we prefer "DaXue", etc., which are 133 | // based on the two-character history. 134 | static const bool kPreferLongHistoryPredict = true; 135 | 136 | // If it is true, prediction will only be based on user dictionary. this flag 137 | // is for debug purpose. 
138 | static const bool kOnlyUserDictPredict = false; 139 | 140 | // The maximum buffer to store LmaPsbItems. 141 | static const size_t kMaxLmaPsbItems = 1450; 142 | 143 | // How many rows for each step. 144 | static const size_t kMaxNodeARow = 5; 145 | 146 | // The maximum length of the sentence candidates counted in chinese 147 | // characters 148 | static const size_t kMaxSentenceLength = 16; 149 | 150 | // The size of the matrix node pool. 151 | static const size_t kMtrxNdPoolSize = 200; 152 | 153 | // The size of the DMI node pool. 154 | static const size_t kDmiPoolSize = 800; 155 | 156 | // Used to indicate whether this object has been initialized. 157 | bool inited_; 158 | 159 | // Spelling trie. 160 | const SpellingTrie *spl_trie_; 161 | 162 | // Used to indicate this switcher status: when "xian" is parseed, should 163 | // "xi an" also be extended. Default is false. 164 | // These cases include: xia, xian, xiang, zhuan, jiang..., etc. The string 165 | // should be valid for a FULL spelling, or a combination of two spellings, 166 | // first of which is a FULL id too. So even it is true, "da" will never be 167 | // split into "d a", because "d" is not a full spelling id. 168 | bool xi_an_enabled_; 169 | 170 | // System dictionary. 171 | DictTrie* dict_trie_; 172 | 173 | // User dictionary. 174 | AtomDictBase* user_dict_; 175 | 176 | // Spelling parser. 177 | SpellingParser* spl_parser_; 178 | 179 | // The maximum allowed length of spelling string (such as a Pinyin string). 180 | size_t max_sps_len_; 181 | 182 | // The maximum allowed length of a result Chinese string. 183 | size_t max_hzs_len_; 184 | 185 | // Pinyin string. Max length: kMaxRowNum - 1 186 | char pys_[kMaxRowNum]; 187 | 188 | // The length of the string that has been decoded successfully. 189 | size_t pys_decoded_len_; 190 | 191 | // Shared buffer for multiple purposes. 
192 | size_t *share_buf_; 193 | 194 | MatrixNode *mtrx_nd_pool_; 195 | PoolPosType mtrx_nd_pool_used_; // How many nodes used in the pool 196 | DictMatchInfo *dmi_pool_; 197 | PoolPosType dmi_pool_used_; // How many items used in the pool 198 | 199 | MatrixRow *matrix_; // The first row is for starting 200 | 201 | DictExtPara *dep_; // Parameter used to extend DMI nodes. 202 | 203 | NPredictItem *npre_items_; // Used to do prediction 204 | size_t npre_items_len_; 205 | 206 | // The starting positions and lemma ids for the full sentence candidate. 207 | size_t lma_id_num_; 208 | uint16 lma_start_[kMaxRowNum]; // Counted in spelling ids. 209 | LemmaIdType lma_id_[kMaxRowNum]; 210 | size_t fixed_lmas_; 211 | 212 | // If fixed_lmas_ is bigger than i, Element i is used to indicate whether 213 | // the i'th lemma id in lma_id_ is the first candidate for that step. 214 | // If all candidates are the first one for that step, the whole string can be 215 | // decoded by the engine automatically, so no need to add it to user 216 | // dictionary. (We are considering to add it to user dictionary in the 217 | // future). 218 | uint8 fixed_lmas_no1_[kMaxRowNum]; 219 | 220 | // Composing phrase 221 | ComposingPhrase c_phrase_; 222 | 223 | // If dmi_c_phrase_ is true, the decoder will try to match the 224 | // composing phrase (And definitely it will match successfully). If it 225 | // is false, the decoder will try to match lemma items in dictionaries. 226 | bool dmi_c_phrase_; 227 | 228 | // The starting positions and spelling ids for the first full sentence 229 | // candidate. 230 | size_t spl_id_num_; // Number of spelling ids 231 | uint16 spl_start_[kMaxRowNum]; // Starting positions 232 | uint16 spl_id_[kMaxRowNum]; // Spelling ids 233 | // Used to remember the last fixed position, counted in Hanzi. 234 | size_t fixed_hzs_; 235 | 236 | // Lemma Items with possibility score, two purposes: 237 | // 1.
In Viterbi decoding, this buffer is used to get all possible candidates 238 | // for current step; 239 | // 2. When the search is done, this buffer is used to get candidates from the 240 | // first un-fixed step and show them to the user. 241 | LmaPsbItem lpi_items_[kMaxLmaPsbItems]; 242 | size_t lpi_total_; 243 | 244 | // Assign the pointers with NULL. The caller makes sure that all pointers are 245 | // not valid before calling it. This function will only be called in the 246 | // construction function and free_resource(). 247 | void reset_pointers_to_null(); 248 | 249 | bool alloc_resource(); 250 | 251 | void free_resource(); 252 | 253 | // Reset the search space totally. 254 | bool reset_search0(); 255 | 256 | // Reset the search space from ch_pos step. For example, if the original 257 | // input Pinyin is "an", reset_search(1) will reset the search space to the 258 | // result of "a". If the given position is out of range, return false. 259 | // If clear_fixed_this_step is true, and the ch_pos step is a fixed step, 260 | // clear its fixed status. If clear_dmi_this_step is true, clear the DMI nodes. 261 | // If clear_mtrx_this_step is true, clear the mtrx nodes of this step. 262 | // The DMI nodes will be kept. 263 | // 264 | // Note: this function should not destroy content of pys_. 265 | bool reset_search(size_t ch_pos, bool clear_fixed_this_step, 266 | bool clear_dmi_this_step, bool clear_mtrx_this_step); 267 | 268 | // Delete a part of the content in pys_. 269 | void del_in_pys(size_t start, size_t len); 270 | 271 | // Delete a spelling id and its corresponding Chinese character, and merge 272 | // the fixed lemmas into the composing phrase. 273 | // del_spl_pos indicates which spelling id needs to be deleted. 274 | // This function will update the lemma and spelling segmentation information. 275 | // The caller guarantees that fixed_lmas_ > 0 and del_spl_pos is within 276 | // the fixed lemmas.
277 | void merge_fixed_lmas(size_t del_spl_pos); 278 | 279 | // Get spelling start positions and ids. The result will be stored in 280 | // spl_id_num_, spl_start_[], spl_id_[]. 281 | // fixed_hzs_ will also be assigned. 282 | void get_spl_start_id(); 283 | 284 | // Get all lemma ids which match the given spelling id stream(shorter than the 285 | // maximum length of a word). 286 | // If pfullsent is not NULL, means the full sentence candidate may be the 287 | // same with the coming lemma string, if so, remove that lemma. 288 | // The result is sorted in descending order by the frequency score. 289 | size_t get_lpis(const uint16* splid_str, size_t splid_str_len, 290 | LmaPsbItem* lma_buf, size_t max_lma_buf, 291 | const char16 *pfullsent, bool sort_by_psb); 292 | 293 | uint16 get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, uint16 str_max); 294 | 295 | uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids, 296 | uint16 splids_max, bool arg_valid); 297 | 298 | 299 | // Extend a DMI node with a spelling id. ext_len is the length of the rows 300 | // to extend, actually, it is the size of the spelling string of splid. 301 | // return value can be 1 or 0. 302 | // 1 means a new DMI is filled in (dmi_pool_used_ is the next blank DMI in 303 | // the pool). 304 | // 0 means either the dmi node can not be extended with splid, or the splid 305 | // is a Shengmu id, which is only used to get lpi_items, or the result node 306 | // in DictTrie has no son, it is not necessary to keep the new DMI. 307 | // 308 | // This function modifies the content of lpi_items_ and lpi_total_. 309 | // lpi_items_ is used to get the LmaPsbItem list, lpi_total_ returns the size. 310 | // The function's returned value has no relation with the value of lpi_num. 311 | // 312 | // If dmi_s == NULL, this function will extend the root node of DictTrie 313 | // 314 | // This function will not change dmi_pool_used_. Please change it after 315 | // calling this function if necessary.
316 | // 317 | // The caller should guarantee that NULL != dep. 318 | size_t extend_dmi(DictExtPara *dep, DictMatchInfo *dmi_s); 319 | 320 | // Extend dmi for the composing phrase. 321 | size_t extend_dmi_c(DictExtPara *dep, DictMatchInfo *dmi_s); 322 | 323 | // Extend a MatrixNode with the given LmaPsbItem list. 324 | // res_row is the destination row number. 325 | // This function does not change mtrx_nd_pool_used_. Please change it after 326 | // calling this function if necessary. 327 | // return 0 always. 328 | size_t extend_mtrx_nd(MatrixNode *mtrx_nd, LmaPsbItem lpi_items[], 329 | size_t lpi_num, PoolPosType dmi_fr, size_t res_row); 330 | 331 | 332 | // Try to find a dmi node at step_to position, and the found dmi node should 333 | // match the given spelling id strings. 334 | PoolPosType match_dmi(size_t step_to, uint16 spl_ids[], uint16 spl_id_num); 335 | 336 | bool add_char(char ch); 337 | bool prepare_add_char(char ch); 338 | 339 | // Called after prepare_add_char, so the input char has been saved. 340 | bool add_char_qwerty(); 341 | 342 | // Prepare candidates from the last fixed hanzi position. 343 | void prepare_candidates(); 344 | 345 | // Is the character in step pos a splitter character? 346 | // The caller guarantees that the position is valid. 347 | bool is_split_at(uint16 pos); 348 | 349 | void fill_dmi(DictMatchInfo *dmi, MileStoneHandle *handles, 350 | PoolPosType dmi_fr, 351 | uint16 spl_id, uint16 node_num, unsigned char dict_level, 352 | bool splid_end_split, unsigned char splstr_len, 353 | unsigned char all_full_id); 354 | 355 | size_t inner_predict(const char16 fixed_scis_ids[], uint16 scis_num, 356 | char16 predict_buf[][kMaxPredictSize + 1], 357 | size_t buf_len); 358 | 359 | // Add the first candidate to the user dictionary. 360 | bool try_add_cand0_to_userdict(); 361 | 362 | // Add a user lemma to the user dictionary. This lemma is a subset of 363 | // candidate 0.
lma_from is from which lemma in lma_ids_, lma_num is the 364 | // number of lemmas to be combined together as a new lemma. The caller 365 | // guarantees that the combined new lemma's length is less than or equal to 366 | // kMaxLemmaSize. 367 | bool add_lma_to_userdict(uint16 lma_from, uint16 lma_num, float score); 368 | 369 | // Update dictionary frequencies. 370 | void update_dict_freq(); 371 | 372 | void debug_print_dmi(PoolPosType dmi_pos, uint16 nest_level); 373 | 374 | public: 375 | MatrixSearch(); 376 | ~MatrixSearch(); 377 | 378 | bool init(const char *fn_sys_dict, const char *fn_usr_dict); 379 | 380 | bool init_fd(int sys_fd, long start_offset, long length, 381 | const char *fn_usr_dict); 382 | 383 | void set_max_lens(size_t max_sps_len, size_t max_hzs_len); 384 | 385 | void close(); 386 | 387 | void flush_cache(); 388 | 389 | void set_xi_an_switch(bool xi_an_enabled); 390 | 391 | bool get_xi_an_switch(); 392 | 393 | // Reset the search space. Equivalent to reset_search(0). 394 | // If inited, always return true; 395 | bool reset_search(); 396 | 397 | // Search a Pinyin string. 398 | // Return value is the position successfully parsed. 399 | size_t search(const char *py, size_t py_len); 400 | 401 | // Used to delete something in the Pinyin string kept by the engine, and do 402 | // a re-search. 403 | // Return value is the new length of Pinyin string kept by the engine which 404 | // is parsed successfully. 405 | // If is_pos_in_splid is false, pos is used to indicate that pos-th Pinyin 406 | // character needs to be deleted. If is_pos_in_splid is true, all Pinyin 407 | // characters for pos-th spelling id need to be deleted. 408 | // If the deleted character(s) is just after a fixed lemma or sub lemma in 409 | // composing phrase, clear_fixed_this_step indicates whether we need to 410 | // unlock the last fixed lemma or sub lemma.
411 | // If is_pos_in_splid is false, and pos-th character is in the range for the 412 | // fixed lemmas or composing string, this function will do nothing and just 413 | // return the result of the previous search. 414 | size_t delsearch(size_t pos, bool is_pos_in_splid, 415 | bool clear_fixed_this_step); 416 | 417 | // Get the number of candidates, called after search(). 418 | size_t get_candidate_num(); 419 | 420 | // Get the Pinyin string stored by the engine. 421 | // *decoded_len returns the length of the successfully decoded string. 422 | const char* get_pystr(size_t *decoded_len); 423 | 424 | // Get the spelling boundaries for the first sentence candidate. 425 | // Number of spellings will be returned. The number of valid elements in 426 | // spl_start is one more than the return value because the last one is used 427 | // to indicate the beginning of the next un-input spelling. 428 | // For a Pinyin "women", the returned value is 2, spl_start is [0, 2, 5]. 429 | size_t get_spl_start(const uint16 *&spl_start); 430 | 431 | // Get one candidate string. If full sentence candidate is available, it will 432 | // be the first one. 433 | char16* get_candidate(size_t cand_id, char16 *cand_str, size_t max_len); 434 | 435 | // Get the first candidate, which is a "full sentence". 436 | // If retstr_len is not NULL, it will be used to return the string length. 437 | // If only_unfixed is true, only unfixed part will be fetched. 438 | char16* get_candidate0(char16* cand_str, size_t max_len, 439 | uint16 *retstr_len, bool only_unfixed); 440 | 441 | // Choose a candidate. The decoder will do a search after the fixed position. 442 | size_t choose(size_t cand_id); 443 | 444 | // Cancel the last choosing operation, and return the new number of choices. 445 | size_t cancel_last_choice(); 446 | 447 | // Get the length of fixed Hanzis.
448 | size_t get_fixedlen(); 449 | 450 | size_t get_predicts(const char16 fixed_buf[], 451 | char16 predict_buf[][kMaxPredictSize + 1], 452 | size_t buf_len); 453 | }; 454 | } 455 | 456 | #endif // PINYINIME_ANDPY_INCLUDE_MATRIXSEARCH_H__ 457 | -------------------------------------------------------------------------------- /pinyin/mystdlib.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009 The Android Open Source Project 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | 19 | namespace ime_pinyin { 20 | 21 | // For debug purpose. You can add a fixed version of qsort and bsearch functions 22 | // here so that the output will be totally the same under different platforms. 
23 | 24 | void myqsort(void *p, size_t n, size_t es, 25 | int (*cmp)(const void *, const void *)) { 26 | qsort(p,n, es, cmp); 27 | } 28 | 29 | void *mybsearch(const void *k, const void *b, 30 | size_t n, size_t es, 31 | int (*cmp)(const void *, const void *)) { 32 | return bsearch(k, b, n, es, cmp); 33 | } 34 | } // namespace ime_pinyin 35 | -------------------------------------------------------------------------------- /pinyin/mystdlib.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009 The Android Open Source Project 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #ifndef PINYINIME_INCLUDE_MYSTDLIB_H__ 18 | #define PINYINIME_INCLUDE_MYSTDLIB_H__ 19 | 20 | #include 21 | 22 | namespace ime_pinyin { 23 | 24 | void myqsort(void *p, size_t n, size_t es, 25 | int (*cmp)(const void *, const void *)); 26 | 27 | void *mybsearch(const void *key, const void *base, 28 | size_t nmemb, size_t size, 29 | int (*compar)(const void *, const void *)); 30 | } 31 | 32 | #endif // PINYINIME_INCLUDE_MYSTDLIB_H__ 33 | -------------------------------------------------------------------------------- /pinyin/ngram.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009 The Android Open Source Project 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include "mystdlib.h" 23 | #include "ngram.h" 24 | 25 | namespace ime_pinyin { 26 | 27 | #define ADD_COUNT 0.3 28 | 29 | int comp_double(const void *p1, const void *p2) { 30 | if (*static_cast(p1) < *static_cast(p2)) 31 | return -1; 32 | if (*static_cast(p1) > *static_cast(p2)) 33 | return 1; 34 | return 0; 35 | } 36 | 37 | inline double distance(double freq, double code) { 38 | // return fabs(freq - code); 39 | return freq * fabs(log(freq) - log(code)); 40 | } 41 | 42 | // Find the index of the code value which is nearest to the given freq 43 | int qsearch_nearest(double code_book[], double freq, int start, int end) { 44 | if (start == end) 45 | return start; 46 | 47 | if (start + 1 == end) { 48 | if (distance(freq, code_book[end]) > distance(freq, code_book[start])) 49 | return start; 50 | return end; 51 | } 52 | 53 | int mid = (start + end) / 2; 54 | 55 | if (code_book[mid] > freq) 56 | return qsearch_nearest(code_book, freq, start, mid); 57 | else 58 | return qsearch_nearest(code_book, freq, mid, end); 59 | } 60 | 61 | size_t update_code_idx(double freqs[], size_t num, double code_book[], 62 | CODEBOOK_TYPE *code_idx) { 63 | size_t changed = 0; 64 | for (size_t pos = 0; pos < num; pos++) { 65 | CODEBOOK_TYPE idx; 66 | idx = qsearch_nearest(code_book, freqs[pos], 0, kCodeBookSize - 1); 67 | if (idx != code_idx[pos]) 68 | changed++; 69 | code_idx[pos] = idx; 70 | } 71 | return changed; 72 | } 73 | 74 | double recalculate_kernel(double freqs[], size_t num, double code_book[], 75 | CODEBOOK_TYPE *code_idx) { 76 | double ret = 0; 77 | 78 | size_t *item_num = new size_t[kCodeBookSize]; 79 | assert(item_num); 80 | memset(item_num, 0, sizeof(size_t) * kCodeBookSize); 81 | 82 | double *cb_new = new double[kCodeBookSize]; 83 | assert(cb_new); 84 | memset(cb_new, 0, sizeof(double) * kCodeBookSize); 85 | 86 | for (size_t pos = 0; pos < num; pos++) { 87 | ret += distance(freqs[pos], 
code_book[code_idx[pos]]); 88 | 89 | cb_new[code_idx[pos]] += freqs[pos]; 90 | item_num[code_idx[pos]] += 1; 91 | } 92 | 93 | for (size_t code = 0; code < kCodeBookSize; code++) { 94 | assert(item_num[code] > 0); 95 | code_book[code] = cb_new[code] / item_num[code]; 96 | } 97 | 98 | delete [] item_num; 99 | delete [] cb_new; 100 | 101 | return ret; 102 | } 103 | 104 | void iterate_codes(double freqs[], size_t num, double code_book[], 105 | CODEBOOK_TYPE *code_idx) { 106 | size_t iter_num = 0; 107 | double delta_last = 0; 108 | do { 109 | size_t changed = update_code_idx(freqs, num, code_book, code_idx); 110 | 111 | double delta = recalculate_kernel(freqs, num, code_book, code_idx); 112 | 113 | if (kPrintDebug0) { 114 | printf("---Unigram codebook iteration: %zd : %zd, %.9f\n", 115 | iter_num, changed, delta); 116 | } 117 | iter_num++; 118 | 119 | if (iter_num > 1 && 120 | (delta == 0 || fabs(delta_last - delta)/fabs(delta) < 0.000000001)) 121 | break; 122 | delta_last = delta; 123 | } while (true); 124 | } 125 | 126 | 127 | NGram* NGram::instance_ = NULL; 128 | 129 | NGram::NGram() { 130 | initialized_ = false; 131 | idx_num_ = 0; 132 | lma_freq_idx_ = NULL; 133 | sys_score_compensation_ = 0; 134 | 135 | #ifdef ___BUILD_MODEL___ 136 | freq_codes_df_ = NULL; 137 | #endif 138 | freq_codes_ = NULL; 139 | } 140 | 141 | NGram::~NGram() { 142 | if (NULL != lma_freq_idx_) 143 | free(lma_freq_idx_); 144 | 145 | #ifdef ___BUILD_MODEL___ 146 | if (NULL != freq_codes_df_) 147 | free(freq_codes_df_); 148 | #endif 149 | 150 | if (NULL != freq_codes_) 151 | free(freq_codes_); 152 | } 153 | 154 | NGram& NGram::get_instance() { 155 | if (NULL == instance_) 156 | instance_ = new NGram(); 157 | return *instance_; 158 | } 159 | 160 | bool NGram::save_ngram(FILE *fp) { 161 | if (!initialized_ || NULL == fp) 162 | return false; 163 | 164 | if (0 == idx_num_ || NULL == freq_codes_ || NULL == lma_freq_idx_) 165 | return false; 166 | 167 | if (fwrite(&idx_num_, sizeof(size_t), 1, fp) != 
1) 168 | return false; 169 | 170 | if (fwrite(freq_codes_, sizeof(LmaScoreType), kCodeBookSize, fp) != 171 | kCodeBookSize) 172 | return false; 173 | 174 | if (fwrite(lma_freq_idx_, sizeof(CODEBOOK_TYPE), idx_num_, fp) != idx_num_) 175 | return false; 176 | 177 | return true; 178 | } 179 | 180 | bool NGram::load_ngram(FILE *fp) { 181 | if (NULL == fp) 182 | return false; 183 | 184 | initialized_ = false; 185 | 186 | if (fread(&idx_num_, sizeof(size_t), 1, fp) != 1 ) 187 | return false; 188 | 189 | if (NULL != lma_freq_idx_) 190 | free(lma_freq_idx_); 191 | 192 | if (NULL != freq_codes_) 193 | free(freq_codes_); 194 | 195 | lma_freq_idx_ = static_cast 196 | (malloc(idx_num_ * sizeof(CODEBOOK_TYPE))); 197 | freq_codes_ = static_cast 198 | (malloc(kCodeBookSize * sizeof(LmaScoreType))); 199 | 200 | if (NULL == lma_freq_idx_ || NULL == freq_codes_) 201 | return false; 202 | 203 | if (fread(freq_codes_, sizeof(LmaScoreType), kCodeBookSize, fp) != 204 | kCodeBookSize) 205 | return false; 206 | 207 | if (fread(lma_freq_idx_, sizeof(CODEBOOK_TYPE), idx_num_, fp) != idx_num_) 208 | return false; 209 | 210 | initialized_ = true; 211 | 212 | total_freq_none_sys_ = 0; 213 | return true; 214 | } 215 | 216 | void NGram::set_total_freq_none_sys(size_t freq_none_sys) { 217 | total_freq_none_sys_ = freq_none_sys; 218 | if (0 == total_freq_none_sys_) { 219 | sys_score_compensation_ = 0; 220 | } else { 221 | double factor = static_cast(kSysDictTotalFreq) / ( 222 | kSysDictTotalFreq + total_freq_none_sys_); 223 | sys_score_compensation_ = static_cast( 224 | log(factor) * kLogValueAmplifier); 225 | } 226 | } 227 | 228 | // The caller makes sure this oject is initialized. 
229 | float NGram::get_uni_psb(LemmaIdType lma_id) { 230 | return static_cast(freq_codes_[lma_freq_idx_[lma_id]]) + 231 | sys_score_compensation_; 232 | } 233 | 234 | float NGram::convert_psb_to_score(double psb) { 235 | float score = static_cast( 236 | log(psb) * static_cast(kLogValueAmplifier)); 237 | if (score > static_cast(kMaxScore)) { 238 | score = static_cast(kMaxScore); 239 | } 240 | return score; 241 | } 242 | 243 | #ifdef ___BUILD_MODEL___ 244 | bool NGram::build_unigram(LemmaEntry *lemma_arr, size_t lemma_num, 245 | LemmaIdType next_idx_unused) { 246 | if (NULL == lemma_arr || 0 == lemma_num || next_idx_unused <= 1) 247 | return false; 248 | 249 | double total_freq = 0; 250 | double *freqs = new double[next_idx_unused]; 251 | if (NULL == freqs) 252 | return false; 253 | 254 | freqs[0] = ADD_COUNT; 255 | total_freq += freqs[0]; 256 | LemmaIdType idx_now = 0; 257 | for (size_t pos = 0; pos < lemma_num; pos++) { 258 | if (lemma_arr[pos].idx_by_hz == idx_now) 259 | continue; 260 | idx_now++; 261 | 262 | assert(lemma_arr[pos].idx_by_hz == idx_now); 263 | 264 | freqs[idx_now] = lemma_arr[pos].freq; 265 | if (freqs[idx_now] <= 0) 266 | freqs[idx_now] = 0.3; 267 | 268 | total_freq += freqs[idx_now]; 269 | } 270 | 271 | double max_freq = 0; 272 | idx_num_ = idx_now + 1; 273 | assert(idx_now + 1 == next_idx_unused); 274 | 275 | for (size_t pos = 0; pos < idx_num_; pos++) { 276 | freqs[pos] = freqs[pos] / total_freq; 277 | assert(freqs[pos] > 0); 278 | if (freqs[pos] > max_freq) 279 | max_freq = freqs[pos]; 280 | } 281 | 282 | // calculate the code book 283 | if (NULL == freq_codes_df_) 284 | freq_codes_df_ = new double[kCodeBookSize]; 285 | assert(freq_codes_df_); 286 | memset(freq_codes_df_, 0, sizeof(double) * kCodeBookSize); 287 | 288 | if (NULL == freq_codes_) 289 | freq_codes_ = new LmaScoreType[kCodeBookSize]; 290 | assert(freq_codes_); 291 | memset(freq_codes_, 0, sizeof(LmaScoreType) * kCodeBookSize); 292 | 293 | size_t freq_pos = 0; 294 | for (size_t 
code_pos = 0; code_pos < kCodeBookSize; code_pos++) { 295 | bool found = true; 296 | 297 | while (found) { 298 | found = false; 299 | double cand = freqs[freq_pos]; 300 | for (size_t i = 0; i < code_pos; i++) 301 | if (freq_codes_df_[i] == cand) { 302 | found = true; 303 | break; 304 | } 305 | if (found) 306 | freq_pos++; 307 | } 308 | 309 | freq_codes_df_[code_pos] = freqs[freq_pos]; 310 | freq_pos++; 311 | } 312 | 313 | myqsort(freq_codes_df_, kCodeBookSize, sizeof(double), comp_double); 314 | 315 | if (NULL == lma_freq_idx_) 316 | lma_freq_idx_ = new CODEBOOK_TYPE[idx_num_]; 317 | assert(lma_freq_idx_); 318 | 319 | iterate_codes(freqs, idx_num_, freq_codes_df_, lma_freq_idx_); 320 | 321 | delete [] freqs; 322 | 323 | if (kPrintDebug0) { 324 | printf("\n------Language Model Unigram Codebook------\n"); 325 | } 326 | 327 | for (size_t code_pos = 0; code_pos < kCodeBookSize; code_pos++) { 328 | double log_score = log(freq_codes_df_[code_pos]); 329 | float final_score = convert_psb_to_score(freq_codes_df_[code_pos]); 330 | if (kPrintDebug0) { 331 | printf("code:%zd, probability:%.9f, log score:%.3f, final score: %.3f\n", 332 | code_pos, freq_codes_df_[code_pos], log_score, final_score); 333 | } 334 | freq_codes_[code_pos] = static_cast(final_score); 335 | } 336 | 337 | initialized_ = true; 338 | return true; 339 | } 340 | #endif 341 | 342 | } // namespace ime_pinyin 343 | -------------------------------------------------------------------------------- /pinyin/ngram.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009 The Android Open Source Project 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #ifndef PINYINIME_INCLUDE_NGRAM_H__ 18 | #define PINYINIME_INCLUDE_NGRAM_H__ 19 | 20 | #include 21 | #include 22 | #include "./dictdef.h" 23 | 24 | namespace ime_pinyin { 25 | 26 | typedef unsigned char CODEBOOK_TYPE; 27 | 28 | static const size_t kCodeBookSize = 256; 29 | 30 | class NGram { 31 | public: 32 | // The maximum score of a lemma item. 33 | static const LmaScoreType kMaxScore = 0x3fff; 34 | 35 | // In order to reduce the storage size, the original log value is amplified by 36 | // kScoreAmplifier, and we use LmaScoreType to store. 37 | // After this process, an item with a lower score has a higher frequency. 38 | static const int kLogValueAmplifier = -800; 39 | 40 | // System words' total frequency. It is not the real total frequency, instead, 41 | // It is only used to adjust system lemmas' scores when the user dictionary's 42 | // total frequency changes. 43 | // In this version, frequencies of system lemmas are fixed. We are considering 44 | // to make them changable in next version. 45 | static const size_t kSysDictTotalFreq = 100000000; 46 | 47 | private: 48 | 49 | static NGram* instance_; 50 | 51 | bool initialized_; 52 | size_t idx_num_; 53 | 54 | size_t total_freq_none_sys_; 55 | 56 | // Score compensation for system dictionary lemmas. 57 | // Because after user adds some user lemmas, the total frequency changes, and 58 | // we use this value to normalize the score. 
59 | float sys_score_compensation_; 60 | 61 | #ifdef ___BUILD_MODEL___ 62 | double *freq_codes_df_; 63 | #endif 64 | LmaScoreType *freq_codes_; 65 | CODEBOOK_TYPE *lma_freq_idx_; 66 | 67 | public: 68 | NGram(); 69 | ~NGram(); 70 | 71 | static NGram& get_instance(); 72 | 73 | bool save_ngram(FILE *fp); 74 | bool load_ngram(FILE *fp); 75 | 76 | // Set the total frequency of all none system dictionaries. 77 | void set_total_freq_none_sys(size_t freq_none_sys); 78 | 79 | float get_uni_psb(LemmaIdType lma_id); 80 | 81 | // Convert a probability to score. Actually, the score will be limited to 82 | // kMaxScore, but at runtime, we also need float expression to get accurate 83 | // value of the score. 84 | // After the conversion, a lower score indicates a higher probability of the 85 | // item. 86 | static float convert_psb_to_score(double psb); 87 | 88 | #ifdef ___BUILD_MODEL___ 89 | // For constructing the unigram mode model. 90 | bool build_unigram(LemmaEntry *lemma_arr, size_t num, 91 | LemmaIdType next_idx_unused); 92 | #endif 93 | }; 94 | } 95 | 96 | #endif // PINYINIME_INCLUDE_NGRAM_H__ 97 | -------------------------------------------------------------------------------- /pinyin/pinyinime.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009 The Android Open Source Project 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include "pinyinime.h" 19 | #include "dicttrie.h" 20 | #include "matrixsearch.h" 21 | #include "spellingtrie.h" 22 | 23 | #ifdef __cplusplus 24 | extern "C" { 25 | #endif 26 | 27 | using namespace ime_pinyin; 28 | 29 | // The maximum number of the prediction items. 30 | static const size_t kMaxPredictNum = 500; 31 | 32 | // Used to search Pinyin string and give the best candidate. 33 | MatrixSearch* matrix_search = NULL; 34 | 35 | char16 predict_buf[kMaxPredictNum][kMaxPredictSize + 1]; 36 | 37 | bool im_open_decoder(const char *fn_sys_dict, const char *fn_usr_dict) { 38 | if (NULL != matrix_search) 39 | delete matrix_search; 40 | 41 | matrix_search = new MatrixSearch(); 42 | if (NULL == matrix_search) { 43 | return false; 44 | } 45 | 46 | return matrix_search->init(fn_sys_dict, fn_usr_dict); 47 | } 48 | 49 | bool im_open_decoder_fd(int sys_fd, long start_offset, long length, 50 | const char *fn_usr_dict) { 51 | if (NULL != matrix_search) 52 | delete matrix_search; 53 | 54 | matrix_search = new MatrixSearch(); 55 | if (NULL == matrix_search) 56 | return false; 57 | 58 | return matrix_search->init_fd(sys_fd, start_offset, length, fn_usr_dict); 59 | } 60 | 61 | void im_close_decoder() { 62 | if (NULL != matrix_search) { 63 | matrix_search->close(); 64 | delete matrix_search; 65 | } 66 | matrix_search = NULL; 67 | } 68 | 69 | void im_set_max_lens(size_t max_sps_len, size_t max_hzs_len) { 70 | if (NULL != matrix_search) { 71 | matrix_search->set_max_lens(max_sps_len, max_hzs_len); 72 | } 73 | } 74 | 75 | void im_flush_cache() { 76 | if (NULL != matrix_search) 77 | matrix_search->flush_cache(); 78 | } 79 | 80 | // To be updated. 
81 | size_t im_search(const char* pybuf, size_t pylen) { 82 | if (NULL == matrix_search) 83 | return 0; 84 | 85 | matrix_search->search(pybuf, pylen); 86 | return matrix_search->get_candidate_num(); 87 | } 88 | 89 | size_t im_delsearch(size_t pos, bool is_pos_in_splid, 90 | bool clear_fixed_this_step) { 91 | if (NULL == matrix_search) 92 | return 0; 93 | matrix_search->delsearch(pos, is_pos_in_splid, clear_fixed_this_step); 94 | return matrix_search->get_candidate_num(); 95 | } 96 | 97 | void im_reset_search() { 98 | if (NULL == matrix_search) 99 | return; 100 | 101 | matrix_search->reset_search(); 102 | } 103 | 104 | // To be removed 105 | size_t im_add_letter(char ch) { 106 | return 0; 107 | } 108 | 109 | const char* im_get_sps_str(size_t *decoded_len) { 110 | if (NULL == matrix_search) 111 | return NULL; 112 | 113 | return matrix_search->get_pystr(decoded_len); 114 | } 115 | 116 | char16* im_get_candidate(size_t cand_id, char16* cand_str, 117 | size_t max_len) { 118 | if (NULL == matrix_search) 119 | return NULL; 120 | 121 | return matrix_search->get_candidate(cand_id, cand_str, max_len); 122 | } 123 | 124 | size_t im_get_spl_start_pos(const uint16 *&spl_start) { 125 | if (NULL == matrix_search) 126 | return 0; 127 | 128 | return matrix_search->get_spl_start(spl_start); 129 | } 130 | 131 | size_t im_choose(size_t choice_id) { 132 | if (NULL == matrix_search) 133 | return 0; 134 | 135 | return matrix_search->choose(choice_id); 136 | } 137 | 138 | size_t im_cancel_last_choice() { 139 | if (NULL == matrix_search) 140 | return 0; 141 | 142 | return matrix_search->cancel_last_choice(); 143 | } 144 | 145 | size_t im_get_fixed_len() { 146 | if (NULL == matrix_search) 147 | return 0; 148 | 149 | return matrix_search->get_fixedlen(); 150 | } 151 | 152 | // To be removed 153 | bool im_cancel_input() { 154 | return true; 155 | } 156 | 157 | 158 | size_t im_get_predicts(const char16 *his_buf, 159 | char16 (*&pre_buf)[kMaxPredictSize + 1]) { 160 | if (NULL == his_buf) 161 | 
return 0; 162 | 163 | size_t fixed_len = utf16_strlen(his_buf); 164 | const char16 *fixed_ptr = his_buf; 165 | if (fixed_len > kMaxPredictSize) { 166 | fixed_ptr += fixed_len - kMaxPredictSize; 167 | fixed_len = kMaxPredictSize; 168 | } 169 | 170 | pre_buf = predict_buf; 171 | return matrix_search->get_predicts(his_buf, pre_buf, kMaxPredictNum); 172 | } 173 | 174 | void im_enable_shm_as_szm(bool enable) { 175 | SpellingTrie &spl_trie = SpellingTrie::get_instance(); 176 | spl_trie.szm_enable_shm(enable); 177 | } 178 | 179 | void im_enable_ym_as_szm(bool enable) { 180 | SpellingTrie &spl_trie = SpellingTrie::get_instance(); 181 | spl_trie.szm_enable_ym(enable); 182 | } 183 | 184 | #ifdef __cplusplus 185 | } 186 | #endif 187 | -------------------------------------------------------------------------------- /pinyin/pinyinime.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009 The Android Open Source Project 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #ifndef PINYINIME_INCLUDE_ANDPYIME_H__ 18 | #define PINYINIME_INCLUDE_ANDPYIME_H__ 19 | 20 | #include 21 | #include "./dictdef.h" 22 | 23 | #ifdef __cplusplus 24 | extern "C" { 25 | #endif 26 | 27 | namespace ime_pinyin { 28 | 29 | /** 30 | * Open the decoder engine via the system and user dictionary file names. 31 | * 32 | * @param fn_sys_dict The file name of the system dictionary. 
33 | * @param fn_usr_dict The file name of the user dictionary. 34 | * @return true if open the decoder engine successfully. 35 | */ 36 | bool im_open_decoder(const char *fn_sys_dict, const char *fn_usr_dict); 37 | 38 | /** 39 | * Open the decoder engine via the system dictionary FD and user dictionary 40 | * file name. Because on Android, the system dictionary is embedded in the 41 | * whole application apk file. 42 | * 43 | * @param sys_fd The file in which the system dictionary is embedded. 44 | * @param start_offset The starting position of the system dictionary in the 45 | * file sys_fd. 46 | * @param length The length of the system dictionary in the file sys_fd, 47 | * counted in byte. 48 | * @return true if succeed. 49 | */ 50 | bool im_open_decoder_fd(int sys_fd, long start_offset, long length, 51 | const char *fn_usr_dict); 52 | 53 | /** 54 | * Close the decoder engine. 55 | */ 56 | void im_close_decoder(); 57 | 58 | /** 59 | * Set maximum limitations for decoding. If this function is not called, 60 | * default values will be used. For example, due to screen size limitation, 61 | * the UI engine of the IME can only show a certain number of letters(input) 62 | * to decode, and a certain number of Chinese characters(output). If after 63 | * user adds a new letter, the input or the output string is longer than the 64 | * limitations, the engine will discard the recent letter. 65 | * 66 | * @param max_sps_len Maximum length of the spelling string(Pinyin string). 67 | * @max_hzs_len Maximum length of the decoded Chinese character string. 68 | */ 69 | void im_set_max_lens(size_t max_sps_len, size_t max_hzs_len); 70 | 71 | /** 72 | * Flush cached data to persistent memory. Because at runtime, in order to 73 | * achieve best performance, some data is only store in memory. 74 | */ 75 | void im_flush_cache(); 76 | 77 | /** 78 | * Use a spelling string(Pinyin string) to search. 
The engine will try to do 79 | * an incremental search based on its previous search result, so if the new 80 | * string has the same prefix with the previous one stored in the decoder, 81 | * the decoder will only continue the search from the end of the prefix. 82 | * If the caller needs to do a brand new search, please call im_reset_search() 83 | * first. Calling im_search() is equivalent to calling im_add_letter() one by 84 | * one. 85 | * 86 | * @param sps_buf The spelling string buffer to decode. 87 | * @param sps_len The length of the spelling string buffer. 88 | * @return The number of candidates. 89 | */ 90 | size_t im_search(const char* sps_buf, size_t sps_len); 91 | 92 | /** 93 | * Make a delete operation in the current search result, and make research if 94 | * necessary. 95 | * 96 | * @param pos The posistion of char in spelling string to delete, or the 97 | * position of spelling id in result string to delete. 98 | * @param is_pos_in_splid Indicate whether the pos parameter is the position 99 | * in the spelling string, or the position in the result spelling id string. 100 | * @return The number of candidates. 101 | */ 102 | size_t im_delsearch(size_t pos, bool is_pos_in_splid, 103 | bool clear_fixed_this_step); 104 | 105 | /** 106 | * Reset the previous search result. 107 | */ 108 | void im_reset_search(); 109 | 110 | /** 111 | * Add a Pinyin letter to the current spelling string kept by decoder. If the 112 | * decoder fails in adding the letter, it will do nothing. im_get_sps_str() 113 | * can be used to get the spelling string kept by decoder currently. 114 | * 115 | * @param ch The letter to add. 116 | * @return The number of candidates. 117 | */ 118 | size_t im_add_letter(char ch); 119 | 120 | /** 121 | * Get the spelling string kept by the decoder. 122 | * 123 | * @param decoded_len Used to return how many characters in the spelling 124 | * string is successfully parsed. 125 | * @return The spelling string kept by the decoder. 
126 | */ 127 | const char *im_get_sps_str(size_t *decoded_len); 128 | 129 | /** 130 | * Get a candidate(or choice) string. 131 | * 132 | * @param cand_id The id to get a candidate. Started from 0. Usually, id 0 133 | * is a sentence-level candidate. 134 | * @param cand_str The buffer to store the candidate. 135 | * @param max_len The maximum length of the buffer. 136 | * @return cand_str if succeeds, otherwise NULL. 137 | */ 138 | char16* im_get_candidate(size_t cand_id, char16* cand_str, 139 | size_t max_len); 140 | 141 | /** 142 | * Get the segmentation information(the starting positions) of the spelling 143 | * string. 144 | * 145 | * @param spl_start Used to return the starting posistions. 146 | * @return The number of spelling ids. If it is L, there will be L+1 valid 147 | * elements in spl_start, and spl_start[L] is the posistion after the end of 148 | * the last spelling id. 149 | */ 150 | size_t im_get_spl_start_pos(const uint16 *&spl_start); 151 | 152 | /** 153 | * Choose a candidate and make it fixed. If the candidate does not match 154 | * the end of all spelling ids, new candidates will be provided from the 155 | * first unfixed position. If the candidate matches the end of the all 156 | * spelling ids, there will be only one new candidates, or the whole fixed 157 | * sentence. 158 | * 159 | * @param cand_id The id of candidate to select and make it fixed. 160 | * @return The number of candidates. If after the selection, the whole result 161 | * string has been fixed, there will be only one candidate. 162 | */ 163 | size_t im_choose(size_t cand_id); 164 | 165 | /** 166 | * Cancel the last selection, or revert the last operation of im_choose(). 167 | * 168 | * @return The number of candidates. 169 | */ 170 | size_t im_cancel_last_choice(); 171 | 172 | /** 173 | * Get the number of fixed spelling ids, or Chinese characters. 174 | * 175 | * @return The number of fixed spelling ids, of Chinese characters. 
176 | */ 177 | size_t im_get_fixed_len(); 178 | 179 | /** 180 | * Cancel the input state and reset the search workspace. 181 | */ 182 | bool im_cancel_input(); 183 | 184 | /** 185 | * Get prediction candiates based on the given fixed Chinese string as the 186 | * history. 187 | * 188 | * @param his_buf The history buffer to do the prediction. It should be ended 189 | * with '\0'. 190 | * @param pre_buf Used to return prediction result list. 191 | * @return The number of predicted result string. 192 | */ 193 | size_t im_get_predicts(const char16 *his_buf, 194 | char16 (*&pre_buf)[kMaxPredictSize + 1]); 195 | 196 | /** 197 | * Enable Shengmus in ShouZiMu mode. 198 | */ 199 | void im_enable_shm_as_szm(bool enable); 200 | 201 | /** 202 | * Enable Yunmus in ShouZiMu mode. 203 | */ 204 | void im_enable_ym_as_szm(bool enable); 205 | } 206 | 207 | #ifdef __cplusplus 208 | } 209 | #endif 210 | 211 | #endif // PINYINIME_INCLUDE_ANDPYIME_H__ 212 | -------------------------------------------------------------------------------- /pinyin/searchutility.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009 The Android Open Source Project 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include "mystdlib.h" 19 | #include "searchutility.h" 20 | 21 | namespace ime_pinyin { 22 | 23 | bool is_system_lemma(LemmaIdType lma_id) { 24 | return (0 < lma_id && lma_id <= kSysDictIdEnd); 25 | } 26 | 27 | bool is_user_lemma(LemmaIdType lma_id) { 28 | return (kUserDictIdStart <= lma_id && lma_id <= kUserDictIdEnd); 29 | } 30 | 31 | bool is_composing_lemma(LemmaIdType lma_id) { 32 | return (kLemmaIdComposing == lma_id); 33 | } 34 | 35 | int cmp_lpi_with_psb(const void *p1, const void *p2) { 36 | if ((static_cast(p1))->psb > 37 | (static_cast(p2))->psb) 38 | return 1; 39 | if ((static_cast(p1))->psb < 40 | (static_cast(p2))->psb) 41 | return -1; 42 | return 0; 43 | } 44 | 45 | int cmp_lpi_with_unified_psb(const void *p1, const void *p2) { 46 | const LmaPsbItem *item1 = static_cast(p1); 47 | const LmaPsbItem *item2 = static_cast(p2); 48 | 49 | // The real unified psb is psb1 / lma_len1 and psb2 * lma_len2 50 | // But we use psb1 * lma_len2 and psb2 * lma_len1 to get better 51 | // precision. 
52 | size_t up1 = item1->psb * (item2->lma_len); 53 | size_t up2 = item2->psb * (item1->lma_len); 54 | if (up1 < up2) { 55 | return -1; 56 | } 57 | if (up1 > up2) { 58 | return 1; 59 | } 60 | return 0; 61 | } 62 | 63 | int cmp_lpi_with_id(const void *p1, const void *p2) { 64 | if ((static_cast(p1))->id < 65 | (static_cast(p2))->id) 66 | return -1; 67 | if ((static_cast(p1))->id > 68 | (static_cast(p2))->id) 69 | return 1; 70 | return 0; 71 | } 72 | 73 | int cmp_lpi_with_hanzi(const void *p1, const void *p2) { 74 | if ((static_cast(p1))->hanzi < 75 | (static_cast(p2))->hanzi) 76 | return -1; 77 | if ((static_cast(p1))->hanzi > 78 | (static_cast(p2))->hanzi) 79 | return 1; 80 | 81 | return 0; 82 | } 83 | 84 | int cmp_lpsi_with_str(const void *p1, const void *p2) { 85 | return utf16_strcmp((static_cast(p1))->str, 86 | (static_cast(p2))->str); 87 | } 88 | 89 | 90 | int cmp_hanzis_1(const void *p1, const void *p2) { 91 | if (*static_cast(p1) < 92 | *static_cast(p2)) 93 | return -1; 94 | 95 | if (*static_cast(p1) > 96 | *static_cast(p2)) 97 | return 1; 98 | return 0; 99 | } 100 | 101 | int cmp_hanzis_2(const void *p1, const void *p2) { 102 | return utf16_strncmp(static_cast(p1), 103 | static_cast(p2), 2); 104 | } 105 | 106 | int cmp_hanzis_3(const void *p1, const void *p2) { 107 | return utf16_strncmp(static_cast(p1), 108 | static_cast(p2), 3); 109 | } 110 | 111 | int cmp_hanzis_4(const void *p1, const void *p2) { 112 | return utf16_strncmp(static_cast(p1), 113 | static_cast(p2), 4); 114 | } 115 | 116 | int cmp_hanzis_5(const void *p1, const void *p2) { 117 | return utf16_strncmp(static_cast(p1), 118 | static_cast(p2), 5); 119 | } 120 | 121 | int cmp_hanzis_6(const void *p1, const void *p2) { 122 | return utf16_strncmp(static_cast(p1), 123 | static_cast(p2), 6); 124 | } 125 | 126 | int cmp_hanzis_7(const void *p1, const void *p2) { 127 | return utf16_strncmp(static_cast(p1), 128 | static_cast(p2), 7); 129 | } 130 | 131 | int cmp_hanzis_8(const void *p1, const void *p2) { 
132 | return utf16_strncmp(static_cast(p1), 133 | static_cast(p2), 8); 134 | } 135 | 136 | int cmp_npre_by_score(const void *p1, const void *p2) { 137 | if ((static_cast(p1))->psb > 138 | (static_cast(p2))->psb) 139 | return 1; 140 | 141 | if ((static_cast(p1))->psb < 142 | (static_cast(p2))->psb) 143 | return -1; 144 | 145 | return 0; 146 | } 147 | 148 | int cmp_npre_by_hislen_score(const void *p1, const void *p2) { 149 | if ((static_cast(p1))->his_len < 150 | (static_cast(p2))->his_len) 151 | return 1; 152 | 153 | if ((static_cast(p1))->his_len > 154 | (static_cast(p2))->his_len) 155 | return -1; 156 | 157 | if ((static_cast(p1))->psb > 158 | (static_cast(p2))->psb) 159 | return 1; 160 | 161 | if ((static_cast(p1))->psb < 162 | (static_cast(p2))->psb) 163 | return -1; 164 | 165 | return 0; 166 | } 167 | 168 | int cmp_npre_by_hanzi_score(const void *p1, const void *p2) { 169 | int ret_v = (utf16_strncmp((static_cast(p1))->pre_hzs, 170 | (static_cast(p2))->pre_hzs, kMaxPredictSize)); 171 | if (0 != ret_v) 172 | return ret_v; 173 | 174 | if ((static_cast(p1))->psb > 175 | (static_cast(p2))->psb) 176 | return 1; 177 | 178 | if ((static_cast(p1))->psb < 179 | (static_cast(p2))->psb) 180 | return -1; 181 | 182 | return 0; 183 | } 184 | 185 | size_t remove_duplicate_npre(NPredictItem *npre_items, size_t npre_num) { 186 | if (NULL == npre_items || 0 == npre_num) 187 | return 0; 188 | 189 | myqsort(npre_items, npre_num, sizeof(NPredictItem), cmp_npre_by_hanzi_score); 190 | 191 | size_t remain_num = 1; // The first one is reserved. 
192 | for (size_t pos = 1; pos < npre_num; pos++) { 193 | if (utf16_strncmp(npre_items[pos].pre_hzs, 194 | npre_items[remain_num - 1].pre_hzs, 195 | kMaxPredictSize) != 0) { 196 | if (remain_num != pos) { 197 | npre_items[remain_num] = npre_items[pos]; 198 | } 199 | remain_num++; 200 | } 201 | } 202 | return remain_num; 203 | } 204 | 205 | size_t align_to_size_t(size_t size) { 206 | size_t s = sizeof(size_t); 207 | return (size + s -1) / s * s; 208 | } 209 | 210 | } // namespace ime_pinyin 211 | -------------------------------------------------------------------------------- /pinyin/searchutility.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009 The Android Open Source Project 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #ifndef PINYINIME_ANDPY_INCLUDE_SEARCHCOMMON_H__ 18 | #define PINYINIME_ANDPY_INCLUDE_SEARCHCOMMON_H__ 19 | 20 | #include 21 | #include "./spellingtrie.h" 22 | 23 | namespace ime_pinyin { 24 | 25 | // Type used to identify the size of a pool, such as id pool, etc. 26 | typedef uint16 PoolPosType; 27 | 28 | // Type used to identify a parsing mile stone in an atom dictionary. 29 | typedef uint16 MileStoneHandle; 30 | 31 | // Type used to express a lemma and its probability score. 32 | typedef struct { 33 | size_t id:(kLemmaIdSize * 8); 34 | size_t lma_len:4; 35 | uint16 psb; // The score, the lower psb, the higher possibility. 
36 | // For single character items, we may also need Hanzi. 37 | // For multiple characer items, ignore it. 38 | char16 hanzi; 39 | } LmaPsbItem, *PLmaPsbItem; 40 | 41 | // LmaPsbItem extended with string. 42 | typedef struct { 43 | LmaPsbItem lpi; 44 | char16 str[kMaxLemmaSize + 1]; 45 | } LmaPsbStrItem, *PLmaPsbStrItem; 46 | 47 | 48 | typedef struct { 49 | float psb; 50 | char16 pre_hzs[kMaxPredictSize]; 51 | uint16 his_len; // The length of the history used to do the prediction. 52 | } NPredictItem, *PNPredictItem; 53 | 54 | // Parameter structure used to extend in a dictionary. All dictionaries 55 | // receives the same DictExtPara and a dictionary specific MileStoneHandle for 56 | // extending. 57 | // 58 | // When the user inputs a new character, AtomDictBase::extend_dict() will be 59 | // called at least once for each dictionary. 60 | // 61 | // For example, when the user inputs "wm", extend_dict() will be called twice, 62 | // and the DictExtPara parameter are as follows respectively: 63 | // 1. splids = {w, m}; splids_extended = 1; ext_len = 1; step_no = 1; 64 | // splid_end_split = false; id_start = wa(the first id start with 'w'); 65 | // id_num = number of ids starting with 'w'. 66 | // 2. splids = {m}; splids_extended = 0; ext_len = 1; step_no = 1; 67 | // splid_end_split = false; id_start = wa; id_num = number of ids starting with 68 | // 'w'. 69 | // 70 | // For string "women", one of the cases of the DictExtPara parameter is: 71 | // splids = {wo, men}, splids_extended = 1, ext_len = 3 (length of "men"), 72 | // step_no = 4; splid_end_split = false; id_start = men, id_num = 1. 73 | // 74 | typedef struct { 75 | // Spelling ids for extending, there are splids_extended + 1 ids in the 76 | // buffer. 77 | // For a normal lemma, there can only be kMaxLemmaSize spelling ids in max, 78 | // but for a composing phrase, there can kMaxSearchSteps spelling ids. 79 | uint16 splids[kMaxSearchSteps]; 80 | 81 | // Number of ids that have been used before. 
splids[splids_extended] is the 82 | // newly added id for the current extension. 83 | uint16 splids_extended; 84 | 85 | // The step span of the extension. It is also the size of the string for 86 | // the newly added spelling id. 87 | uint16 ext_len; 88 | 89 | // The step number for the current extension. It is also the ending position 90 | // in the input Pinyin string for the substring of spelling ids in splids[]. 91 | // For example, when the user inputs "women", step_no = 4. 92 | // This parameter may useful to manage the MileStoneHandle list for each 93 | // step. When the user deletes a character from the string, MileStoneHandle 94 | // objects for the the steps after that character should be reset; when the 95 | // user begins a new string, all MileStoneHandle objects should be reset. 96 | uint16 step_no; 97 | 98 | // Indicate whether the newly added spelling ends with a splitting character 99 | bool splid_end_split; 100 | 101 | // If the newly added id is a half id, id_start is the first id of the 102 | // corresponding full ids; if the newly added id is a full id, id_start is 103 | // that id. 104 | uint16 id_start; 105 | 106 | // If the newly added id is a half id, id_num is the number of corresponding 107 | // ids; if it is a full id, id_num == 1. 
108 | uint16 id_num; 109 | }DictExtPara, *PDictExtPara; 110 | 111 | bool is_system_lemma(LemmaIdType lma_id); 112 | bool is_user_lemma(LemmaIdType lma_id); 113 | bool is_composing_lemma(LemmaIdType lma_id); 114 | 115 | int cmp_lpi_with_psb(const void *p1, const void *p2); 116 | int cmp_lpi_with_unified_psb(const void *p1, const void *p2); 117 | int cmp_lpi_with_id(const void *p1, const void *p2); 118 | int cmp_lpi_with_hanzi(const void *p1, const void *p2); 119 | 120 | int cmp_lpsi_with_str(const void *p1, const void *p2); 121 | 122 | int cmp_hanzis_1(const void *p1, const void *p2); 123 | int cmp_hanzis_2(const void *p1, const void *p2); 124 | int cmp_hanzis_3(const void *p1, const void *p2); 125 | int cmp_hanzis_4(const void *p1, const void *p2); 126 | int cmp_hanzis_5(const void *p1, const void *p2); 127 | int cmp_hanzis_6(const void *p1, const void *p2); 128 | int cmp_hanzis_7(const void *p1, const void *p2); 129 | int cmp_hanzis_8(const void *p1, const void *p2); 130 | 131 | int cmp_npre_by_score(const void *p1, const void *p2); 132 | int cmp_npre_by_hislen_score(const void *p1, const void *p2); 133 | int cmp_npre_by_hanzi_score(const void *p1, const void *p2); 134 | 135 | 136 | size_t remove_duplicate_npre(NPredictItem *npre_items, size_t npre_num); 137 | 138 | size_t align_to_size_t(size_t size); 139 | 140 | } // namespace 141 | 142 | #endif // PINYINIME_ANDPY_INCLUDE_SEARCHCOMMON_H__ 143 | -------------------------------------------------------------------------------- /pinyin/spellingtable.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009 The Android Open Source Project 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include "spellingtable.h" 23 | 24 | namespace ime_pinyin { 25 | 26 | #ifdef ___BUILD_MODEL___ 27 | 28 | const char SpellingTable:: 29 | kNotSupportList[kNotSupportNum][kMaxSpellingSize + 1] = {"HM", "HNG", "NG"}; 30 | 31 | // "" is the biggest, so that all empty strings will be moved to the end 32 | // _eb mean empty is biggest 33 | int compare_raw_spl_eb(const void* p1, const void* p2) { 34 | if ('\0' == (static_cast(p1))->str[0]) 35 | return 1; 36 | 37 | if ('\0' == (static_cast(p2))->str[0]) 38 | return -1; 39 | 40 | return strcmp((static_cast(p1))->str, 41 | (static_cast(p2))->str); 42 | } 43 | 44 | size_t get_odd_next(size_t value) { 45 | size_t v_next = value; 46 | while (true) { 47 | size_t v_next_sqrt = (size_t)sqrt(v_next); 48 | 49 | bool is_odd = true; 50 | for (size_t v_dv = 2; v_dv < v_next_sqrt + 1; v_dv++) { 51 | if (v_next % v_dv == 0) { 52 | is_odd = false; 53 | break; 54 | } 55 | } 56 | 57 | if (is_odd) 58 | return v_next; 59 | 60 | v_next++; 61 | } 62 | 63 | // never reach here 64 | return 0; 65 | } 66 | 67 | SpellingTable::SpellingTable() { 68 | need_score_ = false; 69 | raw_spellings_ = NULL; 70 | spelling_buf_ = NULL; 71 | spelling_num_ = 0; 72 | total_freq_ = 0; 73 | frozen_ = true; 74 | } 75 | 76 | SpellingTable::~SpellingTable() { 77 | free_resource(); 78 | } 79 | 80 | size_t SpellingTable::get_hash_pos(const char* spelling_str) { 81 | size_t hash_pos = 0; 82 | for (size_t pos = 0; pos < spelling_size_; pos++) { 83 
| if ('\0' == spelling_str[pos]) 84 | break; 85 | hash_pos += (size_t)spelling_str[pos]; 86 | } 87 | 88 | hash_pos = hash_pos % spelling_max_num_; 89 | return hash_pos; 90 | } 91 | 92 | size_t SpellingTable::hash_pos_next(size_t hash_pos) { 93 | hash_pos += 123; 94 | hash_pos = hash_pos % spelling_max_num_; 95 | return hash_pos; 96 | } 97 | 98 | void SpellingTable::free_resource() { 99 | if (NULL != raw_spellings_) 100 | delete [] raw_spellings_; 101 | raw_spellings_ = NULL; 102 | 103 | if (NULL != spelling_buf_) 104 | delete [] spelling_buf_; 105 | spelling_buf_ = NULL; 106 | } 107 | 108 | bool SpellingTable::init_table(size_t pure_spl_size, size_t spl_max_num, 109 | bool need_score) { 110 | if (pure_spl_size == 0 || spl_max_num ==0) 111 | return false; 112 | 113 | need_score_ = need_score; 114 | 115 | free_resource(); 116 | 117 | spelling_size_ = pure_spl_size + 1; 118 | if (need_score) 119 | spelling_size_ += 1; 120 | spelling_max_num_ = get_odd_next(spl_max_num); 121 | spelling_num_ = 0; 122 | 123 | raw_spellings_ = new RawSpelling[spelling_max_num_]; 124 | spelling_buf_ = new char[spelling_max_num_ * (spelling_size_)]; 125 | if (NULL == raw_spellings_ || NULL == spelling_buf_) { 126 | free_resource(); 127 | return false; 128 | } 129 | 130 | memset(raw_spellings_, 0, spelling_max_num_ * sizeof(RawSpelling)); 131 | memset(spelling_buf_, 0, spelling_max_num_ * (spelling_size_)); 132 | frozen_ = false; 133 | total_freq_ = 0; 134 | return true; 135 | } 136 | 137 | bool SpellingTable::put_spelling(const char* spelling_str, double freq) { 138 | if (frozen_ || NULL == spelling_str) 139 | return false; 140 | 141 | for (size_t pos = 0; pos < kNotSupportNum; pos++) { 142 | if (strcmp(spelling_str, kNotSupportList[pos]) == 0) { 143 | return false; 144 | } 145 | } 146 | 147 | total_freq_ += freq; 148 | 149 | size_t hash_pos = get_hash_pos(spelling_str); 150 | 151 | raw_spellings_[hash_pos].str[spelling_size_ - 1] = '\0'; 152 | 153 | if 
(strncmp(raw_spellings_[hash_pos].str, spelling_str, 154 | spelling_size_ - 1) == 0) { 155 | raw_spellings_[hash_pos].freq += freq; 156 | return true; 157 | } 158 | 159 | size_t hash_pos_ori = hash_pos; 160 | 161 | while (true) { 162 | if (strncmp(raw_spellings_[hash_pos].str, 163 | spelling_str, spelling_size_ - 1) == 0) { 164 | raw_spellings_[hash_pos].freq += freq; 165 | return true; 166 | } 167 | 168 | if ('\0' == raw_spellings_[hash_pos].str[0]) { 169 | raw_spellings_[hash_pos].freq += freq; 170 | strncpy(raw_spellings_[hash_pos].str, spelling_str, spelling_size_ - 1); 171 | raw_spellings_[hash_pos].str[spelling_size_ - 1] = '\0'; 172 | spelling_num_++; 173 | return true; 174 | } 175 | 176 | hash_pos = hash_pos_next(hash_pos); 177 | if (hash_pos_ori == hash_pos) 178 | return false; 179 | } 180 | 181 | // never reach here 182 | return false; 183 | } 184 | 185 | bool SpellingTable::contain(const char* spelling_str) { 186 | if (NULL == spelling_str || NULL == spelling_buf_ || frozen_) 187 | return false; 188 | 189 | size_t hash_pos = get_hash_pos(spelling_str); 190 | 191 | if ('\0' == raw_spellings_[hash_pos].str[0]) 192 | return false; 193 | 194 | if (strncmp(raw_spellings_[hash_pos].str, spelling_str, spelling_size_ - 1) 195 | == 0) 196 | return true; 197 | 198 | size_t hash_pos_ori = hash_pos; 199 | 200 | while (true) { 201 | hash_pos = hash_pos_next(hash_pos); 202 | if (hash_pos_ori == hash_pos) 203 | return false; 204 | 205 | if ('\0' == raw_spellings_[hash_pos].str[0]) 206 | return false; 207 | 208 | if (strncmp(raw_spellings_[hash_pos].str, spelling_str, spelling_size_ - 1) 209 | == 0) 210 | return true; 211 | } 212 | 213 | // never reach here 214 | return false; 215 | } 216 | 217 | const char* SpellingTable::arrange(size_t *item_size, size_t *spl_num) { 218 | if (NULL == raw_spellings_ || NULL == spelling_buf_ || 219 | NULL == item_size || NULL == spl_num) 220 | return NULL; 221 | 222 | qsort(raw_spellings_, spelling_max_num_, sizeof(RawSpelling), 223 | 
compare_raw_spl_eb); 224 | 225 | // After sorting, only the first spelling_num_ items are valid. 226 | // Copy them to the destination buffer. 227 | for (size_t pos = 0; pos < spelling_num_; pos++) { 228 | strncpy(spelling_buf_ + pos * spelling_size_, raw_spellings_[pos].str, 229 | spelling_size_); 230 | } 231 | 232 | if (need_score_) { 233 | if (kPrintDebug0) 234 | printf("------------Spelling Possiblities--------------\n"); 235 | 236 | double max_score = 0; 237 | double min_score = 0; 238 | 239 | // After sorting, only the first spelling_num_ items are valid. 240 | for (size_t pos = 0; pos < spelling_num_; pos++) { 241 | raw_spellings_[pos].freq /= total_freq_; 242 | if (need_score_) { 243 | if (0 == pos) { 244 | max_score = raw_spellings_[0].freq; 245 | min_score = max_score; 246 | } else { 247 | if (raw_spellings_[pos].freq > max_score) 248 | max_score = raw_spellings_[pos].freq; 249 | if (raw_spellings_[pos].freq < min_score) 250 | min_score = raw_spellings_[pos].freq; 251 | } 252 | } 253 | } 254 | 255 | if (kPrintDebug0) 256 | printf("-----max psb: %f, min psb: %f\n", max_score, min_score); 257 | 258 | max_score = log(max_score); 259 | min_score = log(min_score); 260 | 261 | if (kPrintDebug0) 262 | printf("-----max log value: %f, min log value: %f\n", 263 | max_score, min_score); 264 | 265 | // The absolute value of min_score is bigger than that of max_score because 266 | // both of them are negative after log function. 267 | score_amplifier_ = 1.0 * 255 / min_score; 268 | 269 | double average_score = 0; 270 | for (size_t pos = 0; pos < spelling_num_; pos++) { 271 | double score = log(raw_spellings_[pos].freq) * score_amplifier_; 272 | assert(score >= 0); 273 | 274 | average_score += score; 275 | 276 | // Because of calculation precision issue, score might be a little bigger 277 | // than 255 after being amplified. 
278 | if (score > 255) 279 | score = 255; 280 | char *this_spl_buf = spelling_buf_ + pos * spelling_size_; 281 | this_spl_buf[spelling_size_ - 1] = 282 | static_cast((unsigned char)score); 283 | 284 | if (kPrintDebug0) { 285 | printf("---pos:%zd, %s, psb:%d\n", pos, this_spl_buf, 286 | (unsigned char)this_spl_buf[spelling_size_ -1]); 287 | } 288 | } 289 | average_score /= spelling_num_; 290 | assert(average_score <= 255); 291 | average_score_ = static_cast(average_score); 292 | 293 | if (kPrintDebug0) 294 | printf("\n----Score Amplifier: %f, Average Score: %d\n", score_amplifier_, 295 | average_score_); 296 | } 297 | 298 | *item_size = spelling_size_; 299 | *spl_num = spelling_num_; 300 | frozen_ = true; 301 | return spelling_buf_; 302 | } 303 | 304 | float SpellingTable::get_score_amplifier() { 305 | return static_cast(score_amplifier_); 306 | } 307 | 308 | unsigned char SpellingTable::get_average_score() { 309 | return average_score_; 310 | } 311 | 312 | #endif // ___BUILD_MODEL___ 313 | } // namespace ime_pinyin 314 | -------------------------------------------------------------------------------- /pinyin/spellingtable.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009 The Android Open Source Project 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #ifndef PINYINIME_INCLUDE_SPELLINGTABLE_H__ 18 | #define PINYINIME_INCLUDE_SPELLINGTABLE_H__ 19 | 20 | #include 21 | #include "./dictdef.h" 22 | 23 | namespace ime_pinyin { 24 | 25 | #ifdef ___BUILD_MODEL___ 26 | 27 | const size_t kMaxSpellingSize = kMaxPinyinSize; 28 | 29 | typedef struct { 30 | char str[kMaxSpellingSize + 1]; 31 | double freq; 32 | } RawSpelling, *PRawSpelling; 33 | 34 | // This class is used to store the spelling strings 35 | // The length of the input spelling string should be less or equal to the 36 | // spelling_size_ (set by init_table). If the input string is too long, 37 | // we only keep its first spelling_size_ chars. 38 | class SpellingTable { 39 | private: 40 | static const size_t kNotSupportNum = 3; 41 | static const char kNotSupportList[kNotSupportNum][kMaxSpellingSize + 1]; 42 | 43 | bool need_score_; 44 | 45 | size_t spelling_max_num_; 46 | 47 | RawSpelling *raw_spellings_; 48 | 49 | // Used to store spelling strings. If the spelling table needs to calculate 50 | // score, an extra char after each spelling string is the score. 51 | // An item with a lower score has a higher probability. 52 | char *spelling_buf_; 53 | size_t spelling_size_; 54 | 55 | double total_freq_; 56 | 57 | size_t spelling_num_; 58 | 59 | double score_amplifier_; 60 | 61 | unsigned char average_score_; 62 | 63 | // If frozen is true, put_spelling() and contain() are not allowed to call. 64 | bool frozen_; 65 | 66 | size_t get_hash_pos(const char* spelling_str); 67 | size_t hash_pos_next(size_t hash_pos); 68 | void free_resource(); 69 | public: 70 | SpellingTable(); 71 | ~SpellingTable(); 72 | 73 | // pure_spl_size is the pure maximum spelling string size. For example, 74 | // "zhuang" is the longgest item in Pinyin, so pure_spl_size should be 6. 75 | // spl_max_num is the maximum number of spelling strings to store. 76 | // need_score is used to indicate whether the caller needs to calculate a 77 | // score for each spelling. 
78 | bool init_table(size_t pure_spl_size, size_t spl_max_num, bool need_score); 79 | 80 | // Put a spelling string to the table. 81 | // It always returns false if called after arrange() withtout a new 82 | // init_table() operation. 83 | // freq is the spelling's occuring count. 84 | // If the spelling has been in the table, occuring count will accumulated. 85 | bool put_spelling(const char* spelling_str, double spl_count); 86 | 87 | // Test whether a spelling string is in the table. 88 | // It always returns false, when being called after arrange() withtout a new 89 | // init_table() operation. 90 | bool contain(const char* spelling_str); 91 | 92 | // Sort the spelling strings and put them from the begin of the buffer. 93 | // Return the pointer of the sorted spelling strings. 94 | // item_size and spl_num return the item size and number of spelling. 95 | // Because each spelling uses a '\0' as terminator, the returned item_size is 96 | // at least one char longer than the spl_size parameter specified by 97 | // init_table(). If the table is initialized to calculate score, item_size 98 | // will be increased by 1, and current_spl_str[item_size - 1] stores an 99 | // unsinged char score. 100 | // An item with a lower score has a higher probability. 101 | // Do not call put_spelling() and contains() after arrange(). 
102 | const char* arrange(size_t *item_size, size_t *spl_num); 103 | 104 | float get_score_amplifier(); 105 | 106 | unsigned char get_average_score(); 107 | }; 108 | #endif // ___BUILD_MODEL___ 109 | } 110 | 111 | #endif // PINYINIME_INCLUDE_SPELLINGTABLE_H__ 112 | -------------------------------------------------------------------------------- /pinyin/spellingtrie.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009 The Android Open Source Project 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #ifndef PINYINIME_INCLUDE_SPELLINGTRIE_H__ 18 | #define PINYINIME_INCLUDE_SPELLINGTRIE_H__ 19 | 20 | #include 21 | #include 22 | #include "./dictdef.h" 23 | 24 | namespace ime_pinyin { 25 | 26 | static const unsigned short kFullSplIdStart = kHalfSpellingIdNum + 1; 27 | 28 | // Node used for the trie of spellings 29 | struct SpellingNode { 30 | SpellingNode *first_son; 31 | // The spelling id for each node. If you need more bits to store 32 | // spelling id, please adjust this structure. 
33 | uint32 spelling_idx:27; 34 | uint32 num_of_son:5; 35 | char char_this_node; 36 | unsigned char score; 37 | }; 38 | 39 | class SpellingTrie { 40 | private: 41 | static const int kMaxYmNum = 64; 42 | static const size_t kValidSplCharNum = 26; 43 | 44 | static const uint16 kHalfIdShengmuMask = 0x01; 45 | static const uint16 kHalfIdYunmuMask = 0x02; 46 | static const uint16 kHalfIdSzmMask = 0x04; 47 | 48 | // Map from half spelling id to single char. 49 | // For half ids of Zh/Ch/Sh, map to z/c/s (low case) respectively. 50 | // For example, 1 to 'A', 2 to 'B', 3 to 'C', 4 to 'c', 5 to 'D', ..., 51 | // 28 to 'Z', 29 to 'z'. 52 | // [0] is not used to achieve better efficiency. 53 | static const char kHalfId2Sc_[kFullSplIdStart + 1]; 54 | 55 | static unsigned char char_flags_[]; 56 | static SpellingTrie* instance_; 57 | 58 | // The spelling table 59 | char *spelling_buf_; 60 | 61 | // The size of longest spelling string, includes '\0' and an extra char to 62 | // store score. For example, "zhuang" is the longgest item in Pinyin list, 63 | // so spelling_size_ is 8. 64 | // Structure: The string ended with '\0' + score char. 65 | // An item with a lower score has a higher probability. 66 | size_t spelling_size_; 67 | 68 | // Number of full spelling ids. 69 | size_t spelling_num_; 70 | 71 | float score_amplifier_; 72 | unsigned char average_score_; 73 | 74 | // The Yunmu id list for the spelling ids (for half ids of Shengmu, 75 | // the Yunmu id is 0). 76 | // The length of the list is spelling_num_ + kFullSplIdStart, 77 | // so that spl_ym_ids_[splid] is the Yunmu id of the splid. 78 | uint8 *spl_ym_ids_; 79 | 80 | // The Yunmu table. 81 | // Each Yunmu will be assigned with Yunmu id from 1. 82 | char *ym_buf_; 83 | size_t ym_size_; // The size of longest Yunmu string, '\0'included. 
84 | size_t ym_num_; 85 | 86 | // The spelling string just queried 87 | char *splstr_queried_; 88 | 89 | // The spelling string just queried 90 | char16 *splstr16_queried_; 91 | 92 | // The root node of the spelling tree 93 | SpellingNode* root_; 94 | 95 | // If a none qwerty key such as a fnction key like ENTER is given, this node 96 | // will be used to indicate that this is not a QWERTY node. 97 | SpellingNode* dumb_node_; 98 | 99 | // If a splitter key is pressed, this node will be used to indicate that this 100 | // is a splitter key. 101 | SpellingNode* splitter_node_; 102 | 103 | // Used to get the first level sons. 104 | SpellingNode* level1_sons_[kValidSplCharNum]; 105 | 106 | // The full spl_id range for specific half id. 107 | // h2f means half to full. 108 | // A half id can be a ShouZiMu id (id to represent the first char of a full 109 | // spelling, including Shengmu and Yunmu), or id of zh/ch/sh. 110 | // [1..kFullSplIdStart-1] is the arrange of half id. 111 | uint16 h2f_start_[kFullSplIdStart]; 112 | uint16 h2f_num_[kFullSplIdStart]; 113 | 114 | // Map from full id to half id. 115 | uint16 *f2h_; 116 | 117 | #ifdef ___BUILD_MODEL___ 118 | // How many node used to build the trie. 119 | size_t node_num_; 120 | #endif 121 | 122 | SpellingTrie(); 123 | 124 | void free_son_trie(SpellingNode* node); 125 | 126 | // Construct a subtree using a subset of the spelling array (from 127 | // item_star to item_end). 128 | // Member spelliing_buf_ and spelling_size_ should be valid. 129 | // parent is used to update its num_of_son and score. 
130 | SpellingNode* construct_spellings_subset(size_t item_start, size_t item_end, 131 | size_t level, SpellingNode *parent); 132 | bool build_f2h(); 133 | 134 | // The caller should guarantee ch >= 'A' && ch <= 'Z' 135 | bool is_shengmu_char(char ch) const; 136 | 137 | // The caller should guarantee ch >= 'A' && ch <= 'Z' 138 | bool is_yunmu_char(char ch) const; 139 | 140 | #ifdef ___BUILD_MODEL___ 141 | // Given a spelling string, return its Yunmu string. 142 | // The caller guaratees spl_str is valid. 143 | const char* get_ym_str(const char *spl_str); 144 | 145 | // Build the Yunmu list, and the mapping relation between the full ids and the 146 | // Yunmu ids. This functin is called after the spelling trie is built. 147 | bool build_ym_info(); 148 | #endif 149 | 150 | friend class SpellingParser; 151 | friend class SmartSplParser; 152 | friend class SmartSplParser2; 153 | 154 | public: 155 | ~SpellingTrie(); 156 | 157 | inline static bool is_valid_spl_char(char ch) { 158 | return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z'); 159 | } 160 | 161 | // The caller guarantees that the two chars are valid spelling chars. 162 | inline static bool is_same_spl_char(char ch1, char ch2) { 163 | return ch1 == ch2 || ch1 - ch2 == 'a' - 'A' || ch2 - ch1 == 'a' - 'A'; 164 | } 165 | 166 | // Construct the tree from the input pinyin array 167 | // The given string list should have been sorted. 168 | // score_amplifier is used to convert a possibility value into score. 169 | // average_score is the average_score of all spellings. The dumb node is 170 | // assigned with this score. 171 | bool construct(const char* spelling_arr, size_t item_size, size_t item_num, 172 | float score_amplifier, unsigned char average_score); 173 | 174 | // Test if the given id is a valid spelling id. 
175 | // If function returns true, the given splid may be updated like this: 176 | // When 'A' is not enabled in ShouZiMu mode, the parsing result for 'A' is 177 | // first given as a half id 1, but because 'A' is a one-char Yunmu and 178 | // it is a valid id, it needs to updated to its corresponding full id. 179 | bool if_valid_id_update(uint16 *splid) const; 180 | 181 | // Test if the given id is a half id. 182 | bool is_half_id(uint16 splid) const; 183 | 184 | bool is_full_id(uint16 splid) const; 185 | 186 | // Test if the given id is a one-char Yunmu id (obviously, it is also a half 187 | // id), such as 'A', 'E' and 'O'. 188 | bool is_half_id_yunmu(uint16 splid) const; 189 | 190 | // Test if this char is a ShouZiMu char. This ShouZiMu char may be not enabled. 191 | // For Pinyin, only i/u/v is not a ShouZiMu char. 192 | // The caller should guarantee that ch >= 'A' && ch <= 'Z' 193 | bool is_szm_char(char ch) const; 194 | 195 | // Test If this char is enabled in ShouZiMu mode. 196 | // The caller should guarantee that ch >= 'A' && ch <= 'Z' 197 | bool szm_is_enabled(char ch) const; 198 | 199 | // Enable/disable Shengmus in ShouZiMu mode(using the first char of a spelling 200 | // to input). 201 | void szm_enable_shm(bool enable); 202 | 203 | // Enable/disable Yunmus in ShouZiMu mode. 204 | void szm_enable_ym(bool enable); 205 | 206 | // Test if this char is enabled in ShouZiMu mode. 207 | // The caller should guarantee ch >= 'A' && ch <= 'Z' 208 | bool is_szm_enabled(char ch) const; 209 | 210 | // Return the number of full ids for the given half id. 211 | uint16 half2full_num(uint16 half_id) const; 212 | 213 | // Return the number of full ids for the given half id, and fill spl_id_start 214 | // to return the first full id. 215 | uint16 half_to_full(uint16 half_id, uint16 *spl_id_start) const; 216 | 217 | // Return the corresponding half id for the given full id. 218 | // Not frequently used, low efficient. 219 | // Return 0 if fails. 
220 | uint16 full_to_half(uint16 full_id) const; 221 | 222 | // To test whether a half id is compatible with a full id. 223 | // Generally, when half_id == full_to_half(full_id), return true. 224 | // But for "Zh, Ch, Sh", if fussy mode is on, half id for 'Z' is compatible 225 | // with a full id like "Zhe". (Fussy mode is not ready). 226 | bool half_full_compatible(uint16 half_id, uint16 full_id) const; 227 | 228 | static const SpellingTrie* get_cpinstance(); 229 | 230 | static SpellingTrie& get_instance(); 231 | 232 | // Save to the file stream 233 | bool save_spl_trie(FILE *fp); 234 | 235 | // Load from the file stream 236 | bool load_spl_trie(FILE *fp); 237 | 238 | // Get the number of spellings 239 | size_t get_spelling_num(); 240 | 241 | // Return the Yunmu id for the given Yunmu string. 242 | // If the string is not valid, return 0; 243 | uint8 get_ym_id(const char* ym_str); 244 | 245 | // Get the readonly Pinyin string for a given spelling id 246 | const char* get_spelling_str(uint16 splid); 247 | 248 | // Get the readonly Pinyin string for a given spelling id 249 | const char16* get_spelling_str16(uint16 splid); 250 | 251 | // Get Pinyin string for a given spelling id. Return the length of the 252 | // string, and fill-in '\0' at the end. 253 | size_t get_spelling_str16(uint16 splid, char16 *splstr16, 254 | size_t splstr16_len); 255 | }; 256 | } 257 | 258 | #endif // PINYINIME_INCLUDE_SPELLINGTRIE_H__ 259 | -------------------------------------------------------------------------------- /pinyin/splparser.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009 The Android Open Source Project 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include "splparser.h" 19 | 20 | namespace ime_pinyin { 21 | 22 | SpellingParser::SpellingParser() { 23 | spl_trie_ = SpellingTrie::get_cpinstance(); 24 | } 25 | 26 | bool SpellingParser::is_valid_to_parse(char ch) { 27 | return SpellingTrie::is_valid_spl_char(ch); 28 | } 29 | 30 | uint16 SpellingParser::splstr_to_idxs(const char *splstr, uint16 str_len, 31 | uint16 spl_idx[], uint16 start_pos[], 32 | uint16 max_size, bool &last_is_pre) { 33 | if (NULL == splstr || 0 == max_size || 0 == str_len) 34 | return 0; 35 | 36 | if (!SpellingTrie::is_valid_spl_char(splstr[0])) 37 | return 0; 38 | 39 | last_is_pre = false; 40 | 41 | const SpellingNode *node_this = spl_trie_->root_; 42 | 43 | uint16 str_pos = 0; 44 | uint16 idx_num = 0; 45 | if (NULL != start_pos) 46 | start_pos[0] = 0; 47 | bool last_is_splitter = false; 48 | 49 | while (str_pos < str_len) { 50 | char char_this = splstr[str_pos]; 51 | // all characters outside of [a, z] are considered as splitters 52 | if (!SpellingTrie::is_valid_spl_char(char_this)) { 53 | // test if the current node is endable 54 | uint16 id_this = node_this->spelling_idx; 55 | if (spl_trie_->if_valid_id_update(&id_this)) { 56 | spl_idx[idx_num] = id_this; 57 | 58 | idx_num++; 59 | str_pos++; 60 | if (NULL != start_pos) 61 | start_pos[idx_num] = str_pos; 62 | if (idx_num >= max_size) 63 | return idx_num; 64 | 65 | node_this = spl_trie_->root_; 66 | last_is_splitter = true; 67 | continue; 68 | } else { 69 | if (last_is_splitter) { 70 | str_pos++; 71 | if (NULL != 
start_pos) 72 | start_pos[idx_num] = str_pos; 73 | continue; 74 | } else { 75 | return idx_num; 76 | } 77 | } 78 | } 79 | 80 | last_is_splitter = false; 81 | 82 | SpellingNode *found_son = NULL; 83 | 84 | if (0 == str_pos) { 85 | if (char_this >= 'a') 86 | found_son = spl_trie_->level1_sons_[char_this - 'a']; 87 | else 88 | found_son = spl_trie_->level1_sons_[char_this - 'A']; 89 | } else { 90 | SpellingNode *first_son = node_this->first_son; 91 | // Because for Zh/Ch/Sh nodes, they are the last in the buffer and 92 | // frequently used, so we scan from the end. 93 | for (int i = 0; i < node_this->num_of_son; i++) { 94 | SpellingNode *this_son = first_son + i; 95 | if (SpellingTrie::is_same_spl_char( 96 | this_son->char_this_node, char_this)) { 97 | found_son = this_son; 98 | break; 99 | } 100 | } 101 | } 102 | 103 | // found, just move the current node pointer to the the son 104 | if (NULL != found_son) { 105 | node_this = found_son; 106 | } else { 107 | // not found, test if it is endable 108 | uint16 id_this = node_this->spelling_idx; 109 | if (spl_trie_->if_valid_id_update(&id_this)) { 110 | // endable, remember the index 111 | spl_idx[idx_num] = id_this; 112 | 113 | idx_num++; 114 | if (NULL != start_pos) 115 | start_pos[idx_num] = str_pos; 116 | if (idx_num >= max_size) 117 | return idx_num; 118 | node_this = spl_trie_->root_; 119 | continue; 120 | } else { 121 | return idx_num; 122 | } 123 | } 124 | 125 | str_pos++; 126 | } 127 | 128 | uint16 id_this = node_this->spelling_idx; 129 | if (spl_trie_->if_valid_id_update(&id_this)) { 130 | // endable, remember the index 131 | spl_idx[idx_num] = id_this; 132 | 133 | idx_num++; 134 | if (NULL != start_pos) 135 | start_pos[idx_num] = str_pos; 136 | } 137 | 138 | last_is_pre = !last_is_splitter; 139 | 140 | return idx_num; 141 | } 142 | 143 | uint16 SpellingParser::splstr_to_idxs_f(const char *splstr, uint16 str_len, 144 | uint16 spl_idx[], uint16 start_pos[], 145 | uint16 max_size, bool &last_is_pre) { 146 | uint16 
idx_num = splstr_to_idxs(splstr, str_len, spl_idx, start_pos, 147 | max_size, last_is_pre); 148 | for (uint16 pos = 0; pos < idx_num; pos++) { 149 | if (spl_trie_->is_half_id_yunmu(spl_idx[pos])) { 150 | spl_trie_->half_to_full(spl_idx[pos], spl_idx + pos); 151 | if (pos == idx_num - 1) { 152 | last_is_pre = false; 153 | } 154 | } 155 | } 156 | return idx_num; 157 | } 158 | 159 | uint16 SpellingParser::splstr16_to_idxs(const char16 *splstr, uint16 str_len, 160 | uint16 spl_idx[], uint16 start_pos[], 161 | uint16 max_size, bool &last_is_pre) { 162 | if (NULL == splstr || 0 == max_size || 0 == str_len) 163 | return 0; 164 | 165 | if (!SpellingTrie::is_valid_spl_char(splstr[0])) 166 | return 0; 167 | 168 | last_is_pre = false; 169 | 170 | const SpellingNode *node_this = spl_trie_->root_; 171 | 172 | uint16 str_pos = 0; 173 | uint16 idx_num = 0; 174 | if (NULL != start_pos) 175 | start_pos[0] = 0; 176 | bool last_is_splitter = false; 177 | 178 | while (str_pos < str_len) { 179 | char16 char_this = splstr[str_pos]; 180 | // all characters outside of [a, z] are considered as splitters 181 | if (!SpellingTrie::is_valid_spl_char(char_this)) { 182 | // test if the current node is endable 183 | uint16 id_this = node_this->spelling_idx; 184 | if (spl_trie_->if_valid_id_update(&id_this)) { 185 | spl_idx[idx_num] = id_this; 186 | 187 | idx_num++; 188 | str_pos++; 189 | if (NULL != start_pos) 190 | start_pos[idx_num] = str_pos; 191 | if (idx_num >= max_size) 192 | return idx_num; 193 | 194 | node_this = spl_trie_->root_; 195 | last_is_splitter = true; 196 | continue; 197 | } else { 198 | if (last_is_splitter) { 199 | str_pos++; 200 | if (NULL != start_pos) 201 | start_pos[idx_num] = str_pos; 202 | continue; 203 | } else { 204 | return idx_num; 205 | } 206 | } 207 | } 208 | 209 | last_is_splitter = false; 210 | 211 | SpellingNode *found_son = NULL; 212 | 213 | if (0 == str_pos) { 214 | if (char_this >= 'a') 215 | found_son = spl_trie_->level1_sons_[char_this - 'a']; 216 | else 
217 | found_son = spl_trie_->level1_sons_[char_this - 'A']; 218 | } else { 219 | SpellingNode *first_son = node_this->first_son; 220 | // Because for Zh/Ch/Sh nodes, they are the last in the buffer and 221 | // frequently used, so we scan from the end. 222 | for (int i = 0; i < node_this->num_of_son; i++) { 223 | SpellingNode *this_son = first_son + i; 224 | if (SpellingTrie::is_same_spl_char( 225 | this_son->char_this_node, char_this)) { 226 | found_son = this_son; 227 | break; 228 | } 229 | } 230 | } 231 | 232 | // found, just move the current node pointer to the the son 233 | if (NULL != found_son) { 234 | node_this = found_son; 235 | } else { 236 | // not found, test if it is endable 237 | uint16 id_this = node_this->spelling_idx; 238 | if (spl_trie_->if_valid_id_update(&id_this)) { 239 | // endable, remember the index 240 | spl_idx[idx_num] = id_this; 241 | 242 | idx_num++; 243 | if (NULL != start_pos) 244 | start_pos[idx_num] = str_pos; 245 | if (idx_num >= max_size) 246 | return idx_num; 247 | node_this = spl_trie_->root_; 248 | continue; 249 | } else { 250 | return idx_num; 251 | } 252 | } 253 | 254 | str_pos++; 255 | } 256 | 257 | uint16 id_this = node_this->spelling_idx; 258 | if (spl_trie_->if_valid_id_update(&id_this)) { 259 | // endable, remember the index 260 | spl_idx[idx_num] = id_this; 261 | 262 | idx_num++; 263 | if (NULL != start_pos) 264 | start_pos[idx_num] = str_pos; 265 | } 266 | 267 | last_is_pre = !last_is_splitter; 268 | 269 | return idx_num; 270 | } 271 | 272 | uint16 SpellingParser::splstr16_to_idxs_f(const char16 *splstr, uint16 str_len, 273 | uint16 spl_idx[], uint16 start_pos[], 274 | uint16 max_size, bool &last_is_pre) { 275 | uint16 idx_num = splstr16_to_idxs(splstr, str_len, spl_idx, start_pos, 276 | max_size, last_is_pre); 277 | for (uint16 pos = 0; pos < idx_num; pos++) { 278 | if (spl_trie_->is_half_id_yunmu(spl_idx[pos])) { 279 | spl_trie_->half_to_full(spl_idx[pos], spl_idx + pos); 280 | if (pos == idx_num - 1) { 281 | 
last_is_pre = false; 282 | } 283 | } 284 | } 285 | return idx_num; 286 | } 287 | 288 | uint16 SpellingParser::get_splid_by_str(const char *splstr, uint16 str_len, 289 | bool *is_pre) { 290 | if (NULL == is_pre) 291 | return 0; 292 | 293 | uint16 spl_idx[2]; 294 | uint16 start_pos[3]; 295 | 296 | if (splstr_to_idxs(splstr, str_len, spl_idx, start_pos, 2, *is_pre) != 1) 297 | return 0; 298 | 299 | if (start_pos[1] != str_len) 300 | return 0; 301 | return spl_idx[0]; 302 | } 303 | 304 | uint16 SpellingParser::get_splid_by_str_f(const char *splstr, uint16 str_len, 305 | bool *is_pre) { 306 | if (NULL == is_pre) 307 | return 0; 308 | 309 | uint16 spl_idx[2]; 310 | uint16 start_pos[3]; 311 | 312 | if (splstr_to_idxs(splstr, str_len, spl_idx, start_pos, 2, *is_pre) != 1) 313 | return 0; 314 | 315 | if (start_pos[1] != str_len) 316 | return 0; 317 | if (spl_trie_->is_half_id_yunmu(spl_idx[0])) { 318 | spl_trie_->half_to_full(spl_idx[0], spl_idx); 319 | *is_pre = false; 320 | } 321 | 322 | return spl_idx[0]; 323 | } 324 | 325 | uint16 SpellingParser::get_splids_parallel(const char *splstr, uint16 str_len, 326 | uint16 splidx[], uint16 max_size, 327 | uint16 &full_id_num, bool &is_pre) { 328 | if (max_size <= 0 || !is_valid_to_parse(splstr[0])) 329 | return 0; 330 | 331 | splidx[0] = get_splid_by_str(splstr, str_len, &is_pre); 332 | full_id_num = 0; 333 | if (0 != splidx[0]) { 334 | if (splidx[0] >= kFullSplIdStart) 335 | full_id_num = 1; 336 | return 1; 337 | } 338 | return 0; 339 | } 340 | 341 | } // namespace ime_pinyin 342 | -------------------------------------------------------------------------------- /pinyin/splparser.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009 The Android Open Source Project 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #ifndef PINYINIME_INCLUDE_SPLPARSER_H__ 18 | #define PINYINIME_INCLUDE_SPLPARSER_H__ 19 | 20 | #include "./dictdef.h" 21 | #include "./spellingtrie.h" 22 | 23 | namespace ime_pinyin { 24 | 25 | class SpellingParser { 26 | protected: 27 | const SpellingTrie *spl_trie_; 28 | 29 | public: 30 | SpellingParser(); 31 | 32 | // Given a string, parse it into a spelling id stream. 33 | // If the whole string are sucessfully parsed, last_is_pre will be true; 34 | // if the whole string is not fullly parsed, last_is_pre will return whether 35 | // the last part of the string is a prefix of a full spelling string. For 36 | // example, given string "zhengzhon", "zhon" is not a valid speling, but it is 37 | // the prefix of "zhong". 38 | // 39 | // If splstr starts with a character not in ['a'-z'] (it is a split char), 40 | // return 0. 41 | // Split char can only appear in the middle of the string or at the end. 42 | uint16 splstr_to_idxs(const char *splstr, uint16 str_len, uint16 splidx[], 43 | uint16 start_pos[], uint16 max_size, bool &last_is_pre); 44 | 45 | // Similar to splstr_to_idxs(), the only difference is that splstr_to_idxs() 46 | // convert single-character Yunmus into half ids, while this function converts 47 | // them into full ids. 48 | uint16 splstr_to_idxs_f(const char *splstr, uint16 str_len, uint16 splidx[], 49 | uint16 start_pos[], uint16 max_size, bool &last_is_pre); 50 | 51 | // Similar to splstr_to_idxs(), the only difference is that this function 52 | // uses char16 instead of char8. 
53 | uint16 splstr16_to_idxs(const char16 *splstr, uint16 str_len, uint16 splidx[], 54 | uint16 start_pos[], uint16 max_size, bool &last_is_pre); 55 | 56 | // Similar to splstr_to_idxs_f(), the only difference is that this function 57 | // uses char16 instead of char8. 58 | uint16 splstr16_to_idxs_f(const char16 *splstr16, uint16 str_len, 59 | uint16 splidx[], uint16 start_pos[], 60 | uint16 max_size, bool &last_is_pre); 61 | 62 | // If the given string is a spelling, return the id, others, return 0. 63 | // If the give string is a single char Yunmus like "A", and the char is 64 | // enabled in ShouZiMu mode, the returned spelling id will be a half id. 65 | // When the returned spelling id is a half id, *is_pre returns whether it 66 | // is a prefix of a full spelling string. 67 | uint16 get_splid_by_str(const char *splstr, uint16 str_len, bool *is_pre); 68 | 69 | // If the given string is a spelling, return the id, others, return 0. 70 | // If the give string is a single char Yunmus like "a", no matter the char 71 | // is enabled in ShouZiMu mode or not, the returned spelling id will be 72 | // a full id. 73 | // When the returned spelling id is a half id, *p_is_pre returns whether it 74 | // is a prefix of a full spelling string. 75 | uint16 get_splid_by_str_f(const char *splstr, uint16 str_len, bool *is_pre); 76 | 77 | // Splitter chars are not included. 78 | bool is_valid_to_parse(char ch); 79 | 80 | // When auto-correction is not enabled, get_splid_by_str() will be called to 81 | // return the single result. When auto-correction is enabled, this function 82 | // will be called to get the results. Auto-correction is not ready. 83 | // full_id_num returns number of full spelling ids. 84 | // is_pre returns whether the given string is the prefix of a full spelling 85 | // string. 86 | // If splstr starts with a character not in [a-zA-Z] (it is a split char), 87 | // return 0. 88 | // Split char can only appear in the middle of the string or at the end. 
89 | // The caller should guarantee NULL != splstr && str_len > 0 && NULL != splidx 90 | uint16 get_splids_parallel(const char *splstr, uint16 str_len, 91 | uint16 splidx[], uint16 max_size, 92 | uint16 &full_id_num, bool &is_pre); 93 | }; 94 | } 95 | 96 | #endif // PINYINIME_INCLUDE_SPLPARSER_H__ 97 | -------------------------------------------------------------------------------- /pinyin/sync.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009 The Android Open Source Project 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include "sync.h" 18 | #include 19 | #include 20 | 21 | #ifdef ___SYNC_ENABLED___ 22 | 23 | namespace ime_pinyin { 24 | 25 | Sync::Sync() 26 | : userdict_(NULL), 27 | dictfile_(NULL), 28 | last_count_(0) { 29 | }; 30 | 31 | Sync::~Sync() { 32 | } 33 | 34 | 35 | bool Sync::begin(const char * filename) { 36 | if (userdict_) { 37 | finish(); 38 | } 39 | 40 | if (!filename) { 41 | return false; 42 | } 43 | 44 | dictfile_ = strdup(filename); 45 | if (!dictfile_) { 46 | return false; 47 | } 48 | 49 | userdict_ = new UserDict(); 50 | if (!userdict_) { 51 | free(dictfile_); 52 | dictfile_ = NULL; 53 | return false; 54 | } 55 | 56 | if (userdict_->load_dict((const char*)dictfile_, kUserDictIdStart, 57 | kUserDictIdEnd) == false) { 58 | delete userdict_; 59 | userdict_ = NULL; 60 | free(dictfile_); 61 | dictfile_ = NULL; 62 | return false; 63 | } 64 | 65 | userdict_->set_limit(kUserDictMaxLemmaCount, kUserDictMaxLemmaSize, kUserDictRatio); 66 | 67 | return true; 68 | } 69 | 70 | int Sync::put_lemmas(char16 * lemmas, int len) { 71 | return userdict_->put_lemmas_no_sync_from_utf16le_string(lemmas, len); 72 | } 73 | 74 | int Sync::get_lemmas(char16 * str, int size) { 75 | return userdict_->get_sync_lemmas_in_utf16le_string_from_beginning(str, size, &last_count_); 76 | } 77 | 78 | int Sync::get_last_got_count() { 79 | return last_count_; 80 | } 81 | 82 | int Sync::get_total_count() { 83 | return userdict_->get_sync_count(); 84 | } 85 | 86 | void Sync::clear_last_got() { 87 | if (last_count_ < 0) { 88 | return; 89 | } 90 | userdict_->clear_sync_lemmas(0, last_count_); 91 | last_count_ = 0; 92 | } 93 | 94 | void Sync::finish() { 95 | if (userdict_) { 96 | userdict_->close_dict(); 97 | delete userdict_; 98 | userdict_ = NULL; 99 | free(dictfile_); 100 | dictfile_ = NULL; 101 | last_count_ = 0; 102 | } 103 | } 104 | 105 | int Sync::get_capacity() { 106 | UserDict::UserDictStat stat; 107 | userdict_->state(&stat); 108 | return stat.limit_lemma_count - 
stat.lemma_count; 109 | } 110 | 111 | } 112 | #endif 113 | -------------------------------------------------------------------------------- /pinyin/sync.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009 The Android Open Source Project 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #ifndef PINYINIME_INCLUDE_SYNC_H__ 18 | #define PINYINIME_INCLUDE_SYNC_H__ 19 | 20 | #define ___SYNC_ENABLED___ 21 | 22 | #ifdef ___SYNC_ENABLED___ 23 | 24 | #include "userdict.h" 25 | 26 | namespace ime_pinyin { 27 | 28 | // Class for user dictionary synchronization 29 | // This class is not thread safe 30 | // Normal invoking flow will be 31 | // begin() -> 32 | // put_lemmas() x N -> 33 | // { 34 | // get_lemmas() -> 35 | // [ get_last_got_count() ] -> 36 | // clear_last_got() -> 37 | // } x N -> 38 | // finish() 39 | class Sync { 40 | public: 41 | Sync(); 42 | ~Sync(); 43 | 44 | static const int kUserDictMaxLemmaCount = 5000; 45 | static const int kUserDictMaxLemmaSize = 200000; 46 | static const int kUserDictRatio = 20; 47 | 48 | bool begin(const char * filename); 49 | 50 | // Merge lemmas downloaded from sync server into local dictionary 51 | // lemmas, lemmas string encoded in UTF16LE 52 | // len, length of lemmas string 53 | // Return how many lemmas merged successfully 54 | int put_lemmas(char16 * lemmas, int len); 55 | 56 | // Get local new user lemmas into 
UTF16LE string 57 | // str, buffer ptr to store new user lemmas 58 | // size, size of buffer 59 | // Return length of returned buffer in measure of UTF16LE 60 | int get_lemmas(char16 * str, int size); 61 | 62 | // Return lemmas count in last get_lemmas() 63 | int get_last_got_count(); 64 | 65 | // Return total lemmas count need get_lemmas() 66 | int get_total_count(); 67 | 68 | // Clear lemmas got by recent get_lemmas() 69 | void clear_last_got(); 70 | 71 | void finish(); 72 | 73 | int get_capacity(); 74 | 75 | private: 76 | UserDict * userdict_; 77 | char * dictfile_; 78 | int last_count_; 79 | }; 80 | 81 | } 82 | 83 | #endif 84 | 85 | #endif // PINYINIME_INCLUDE_SYNC_H__ 86 | -------------------------------------------------------------------------------- /pinyin/userdict.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009 The Android Open Source Project 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #ifndef PINYINIME_INCLUDE_USERDICT_H__ 18 | #define PINYINIME_INCLUDE_USERDICT_H__ 19 | 20 | #define ___CACHE_ENABLED___ 21 | #define ___SYNC_ENABLED___ 22 | #define ___PREDICT_ENABLED___ 23 | 24 | // Debug performance for operations 25 | // #define ___DEBUG_PERF___ 26 | 27 | #include 28 | #include 29 | #include "atomdictbase.h" 30 | 31 | namespace ime_pinyin { 32 | // AtomDictBase implementation whose lemmas can be added, updated and removed at run time (put_lemma / update_lemma / remove_lemma) 33 | class UserDict : public AtomDictBase { 34 | public: 35 | UserDict(); 36 | ~UserDict(); 37 | 38 | bool load_dict(const char *file_name, LemmaIdType start_id, 39 | LemmaIdType end_id); 40 | 41 | bool close_dict(); 42 | 43 | size_t number_of_lemmas(); 44 | 45 | void reset_milestones(uint16 from_step, MileStoneHandle from_handle); 46 | 47 | MileStoneHandle extend_dict(MileStoneHandle from_handle, 48 | const DictExtPara *dep, LmaPsbItem *lpi_items, 49 | size_t lpi_max, size_t *lpi_num); 50 | 51 | size_t get_lpis(const uint16 *splid_str, uint16 splid_str_len, 52 | LmaPsbItem *lpi_items, size_t lpi_max); 53 | 54 | uint16 get_lemma_str(LemmaIdType id_lemma, char16* str_buf, 55 | uint16 str_max); 56 | 57 | uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids, 58 | uint16 splids_max, bool arg_valid); 59 | 60 | size_t predict(const char16 last_hzs[], uint16 hzs_len, 61 | NPredictItem *npre_items, size_t npre_max, 62 | size_t b4_used); 63 | 64 | // Full spelling ids are required 65 | LemmaIdType put_lemma(char16 lemma_str[], uint16 splids[], 66 | uint16 lemma_len, uint16 count); 67 | 68 | LemmaIdType update_lemma(LemmaIdType lemma_id, int16 delta_count, 69 | bool selected); 70 | 71 | LemmaIdType get_lemma_id(char16 lemma_str[], uint16 splids[], 72 | uint16 lemma_len); 73 | 74 | LmaScoreType get_lemma_score(LemmaIdType lemma_id); 75 | 76 | LmaScoreType get_lemma_score(char16 lemma_str[], uint16 splids[], 77 | uint16 lemma_len); 78 | 79 | bool remove_lemma(LemmaIdType lemma_id); 80 | 81 | size_t get_total_lemma_count(); 82 | void set_total_lemma_count_of_others(size_t count); 83 | 84 | 
void flush_cache(); 85 | 86 | void set_limit(uint32 max_lemma_count, uint32 max_lemma_size, 87 | uint32 reclaim_ratio); 88 | 89 | void reclaim(); 90 | 91 | void defragment(); 92 | 93 | #ifdef ___SYNC_ENABLED___ 94 | void clear_sync_lemmas(unsigned int start, unsigned int end); 95 | 96 | int get_sync_count(); 97 | 98 | LemmaIdType put_lemma_no_sync(char16 lemma_str[], uint16 splids[], 99 | uint16 lemma_len, uint16 count, uint64 lmt); 100 | /** 101 | * Add lemmas encoded in UTF-16LE into dictionary without adding sync flag. 102 | * 103 | * @param lemmas in format of 'wo men,WM,0.32;da jia,DJ,0.12' 104 | * @param len length of lemmas string in UTF-16LE 105 | * @return newly added lemma count 106 | */ 107 | int put_lemmas_no_sync_from_utf16le_string(char16 * lemmas, int len); 108 | 109 | /** 110 | * Get the lemmas that need sync as a UTF-16LE string in the format above. 111 | * Note: input buffer (str) must not be too small. If str is too small to 112 | * hold even a single lemma, there might be an endless loop. 
113 | * 114 | * @param str buffer to write lemmas 115 | * @param size buffer size in UTF-16LE 116 | * @param count output: number of lemmas returned 117 | * @return UTF-16LE string length 118 | */ 119 | int get_sync_lemmas_in_utf16le_string_from_beginning( 120 | char16 * str, int size, int * count); 121 | 122 | #endif 123 | 124 | struct UserDictStat { 125 | uint32 version; 126 | const char * file_name; 127 | struct timeval load_time; 128 | struct timeval last_update; 129 | uint32 disk_size; 130 | uint32 lemma_count; 131 | uint32 lemma_size; 132 | uint32 delete_count; 133 | uint32 delete_size; 134 | #ifdef ___SYNC_ENABLED___ 135 | uint32 sync_count; 136 | #endif 137 | uint32 reclaim_ratio; 138 | uint32 limit_lemma_count; 139 | uint32 limit_lemma_size; 140 | }; 141 | 142 | bool state(UserDictStat * stat); 143 | 144 | private: 145 | uint32 total_other_nfreq_; 146 | struct timeval load_time_; 147 | LemmaIdType start_id_; 148 | uint32 version_; 149 | uint8 * lemmas_; 150 | 151 | // In-Memory-Only flag for each lemma 152 | static const uint8 kUserDictLemmaFlagRemove = 1; 153 | // Offsets of the in-use lemmas 154 | uint32 * offsets_; 155 | // Highest bit in offset tells whether corresponding lemma is removed 156 | static const uint32 kUserDictOffsetFlagRemove = (1 << 31); 157 | // Maximum possible for the offset 158 | static const uint32 kUserDictOffsetMask = ~(kUserDictOffsetFlagRemove); 159 | // Bit width for last modified time, from 1 to 16 160 | static const uint32 kUserDictLMTBitWidth = 16; 161 | // Granularity for last modified time in seconds 162 | static const uint32 kUserDictLMTGranularity = 60 * 60 * 24 * 7; 163 | // Maximum frequency count 164 | static const uint16 kUserDictMaxFrequency = 0xFFFF; 165 | 166 | #define COARSE_UTC(year, month, day, hour, minute, second) \ 167 | ( \ 168 | (year - 1970) * 365 * 24 * 60 * 60 + \ 169 | (month - 1) * 30 * 24 * 60 * 60 + \ 170 | (day - 1) * 24 * 60 * 60 + \ 171 | (hour - 0) * 60 * 60 + \ 172 | (minute - 0) * 60 + \ 173 | (second - 
0) \ 174 | ) 175 | static const uint64 kUserDictLMTSince = COARSE_UTC(2009, 1, 1, 0, 0, 0); 176 | 177 | // Correspond to offsets_ 178 | uint32 * scores_; 179 | // Following two fields are only valid in memory 180 | uint32 * ids_; 181 | #ifdef ___PREDICT_ENABLED___ 182 | uint32 * predicts_; 183 | #endif 184 | #ifdef ___SYNC_ENABLED___ 185 | uint32 * syncs_; 186 | size_t sync_count_size_; 187 | #endif 188 | uint32 * offsets_by_id_; 189 | 190 | size_t lemma_count_left_; 191 | size_t lemma_size_left_; 192 | 193 | const char * dict_file_; 194 | 195 | // Be sure the size is a multiple of 4 (4 x N) 196 | struct UserDictInfo { 197 | // When limitation reached, how much percentage will be reclaimed (1 ~ 100) 198 | uint32 reclaim_ratio; 199 | // maximum lemma count, 0 means no limitation 200 | uint32 limit_lemma_count; 201 | // Maximum lemma size, it's different from 202 | // whole disk file size or in-mem dict size 203 | // 0 means no limitation 204 | uint32 limit_lemma_size; 205 | // Total lemma count including deleted and in-use 206 | // Also indicate offsets_ size 207 | uint32 lemma_count; 208 | // Total size of lemmas including used and freed 209 | uint32 lemma_size; 210 | // Freed lemma count 211 | uint32 free_count; 212 | // Freed lemma size in bytes 213 | uint32 free_size; 214 | #ifdef ___SYNC_ENABLED___ 215 | uint32 sync_count; 216 | #endif 217 | int32 total_nfreq; 218 | } dict_info_; 219 | 220 | static const uint32 kUserDictVersion = 0x0ABCDEF0; 221 | 222 | static const uint32 kUserDictPreAlloc = 32; 223 | static const uint32 kUserDictAverageNchar = 8; 224 | 225 | enum UserDictState { 226 | // Keep in order 227 | USER_DICT_NONE = 0, 228 | USER_DICT_SYNC, 229 | #ifdef ___SYNC_ENABLED___ 230 | USER_DICT_SYNC_DIRTY, 231 | #endif 232 | USER_DICT_SCORE_DIRTY, 233 | USER_DICT_OFFSET_DIRTY, 234 | USER_DICT_LEMMA_DIRTY, 235 | 236 | USER_DICT_DEFRAGMENTED, 237 | } state_; 238 | 239 | struct UserDictSearchable { 240 | uint16 splids_len; 241 | uint16 splid_start[kMaxLemmaSize]; 242 | uint16 
splid_count[kMaxLemmaSize]; 243 | // Compact initial letters for both FuzzyCompareSpellId and cache system 244 | uint32 signature[kMaxLemmaSize / 4]; 245 | }; 246 | 247 | #ifdef ___CACHE_ENABLED___ 248 | enum UserDictCacheType { 249 | USER_DICT_CACHE, 250 | USER_DICT_MISS_CACHE, 251 | }; 252 | 253 | static const int kUserDictCacheSize = 4; 254 | static const int kUserDictMissCacheSize = kMaxLemmaSize - 1; 255 | 256 | struct UserDictMissCache { 257 | uint32 signatures[kUserDictMissCacheSize][kMaxLemmaSize / 4]; 258 | uint16 head, tail; 259 | } miss_caches_[kMaxLemmaSize]; 260 | 261 | struct UserDictCache { 262 | uint32 signatures[kUserDictCacheSize][kMaxLemmaSize / 4]; 263 | uint32 offsets[kUserDictCacheSize]; 264 | uint32 lengths[kUserDictCacheSize]; 265 | // Ring buffer 266 | uint16 head, tail; 267 | } caches_[kMaxLemmaSize]; 268 | 269 | void cache_init(); 270 | 271 | void cache_push(UserDictCacheType type, 272 | UserDictSearchable *searchable, 273 | uint32 offset, uint32 length); 274 | 275 | bool cache_hit(UserDictSearchable *searchable, 276 | uint32 *offset, uint32 *length); 277 | 278 | bool load_cache(UserDictSearchable *searchable, 279 | uint32 *offset, uint32 *length); 280 | 281 | void save_cache(UserDictSearchable *searchable, 282 | uint32 offset, uint32 length); 283 | 284 | void reset_cache(); 285 | 286 | bool load_miss_cache(UserDictSearchable *searchable); 287 | 288 | void save_miss_cache(UserDictSearchable *searchable); 289 | 290 | void reset_miss_cache(); 291 | #endif 292 | 293 | LmaScoreType translate_score(int f); 294 | 295 | int extract_score_freq(int raw_score); 296 | 297 | uint64 extract_score_lmt(int raw_score); 298 | 299 | inline int build_score(uint64 lmt, int freq); 300 | 301 | inline int64 utf16le_atoll(uint16 *s, int len); 302 | 303 | inline int utf16le_lltoa(int64 v, uint16 *s, int size); 304 | 305 | LemmaIdType _put_lemma(char16 lemma_str[], uint16 splids[], 306 | uint16 lemma_len, uint16 count, uint64 lmt); 307 | 308 | size_t 
_get_lpis(const uint16 *splid_str, uint16 splid_str_len, 309 | LmaPsbItem *lpi_items, size_t lpi_max, bool * need_extend); 310 | 311 | int _get_lemma_score(char16 lemma_str[], uint16 splids[], uint16 lemma_len); 312 | 313 | int _get_lemma_score(LemmaIdType lemma_id); 314 | 315 | int is_fuzzy_prefix_spell_id(const uint16 * id1, uint16 len1, 316 | const UserDictSearchable *searchable); 317 | 318 | bool is_prefix_spell_id(const uint16 * fullids, 319 | uint16 fulllen, const UserDictSearchable *searchable); 320 | 321 | uint32 get_dict_file_size(UserDictInfo * info); 322 | 323 | bool reset(const char *file); 324 | 325 | bool validate(const char *file); 326 | 327 | bool load(const char *file, LemmaIdType start_id); 328 | 329 | bool is_valid_state(); 330 | 331 | bool is_valid_lemma_id(LemmaIdType id); 332 | 333 | LemmaIdType get_max_lemma_id(); 334 | 335 | void set_lemma_flag(uint32 offset, uint8 flag); 336 | 337 | char get_lemma_flag(uint32 offset); 338 | 339 | char get_lemma_nchar(uint32 offset); 340 | 341 | uint16 * get_lemma_spell_ids(uint32 offset); 342 | 343 | uint16 * get_lemma_word(uint32 offset); 344 | 345 | // Prepare searchable to speed up the locate process 346 | void prepare_locate(UserDictSearchable *searchable, 347 | const uint16 * splids, uint16 len); 348 | 349 | // Compare initial letters only 350 | int32 fuzzy_compare_spell_id(const uint16 * id1, uint16 len1, 351 | const UserDictSearchable *searchable); 352 | 353 | // Compare exactly two spell ids 354 | // First argument must consist of full spelling ids 355 | bool equal_spell_id(const uint16 * fullids, 356 | uint16 fulllen, const UserDictSearchable *searchable); 357 | 358 | // Find first item by initial letters 359 | int32 locate_first_in_offsets(const UserDictSearchable *searchable); 360 | 361 | LemmaIdType append_a_lemma(char16 lemma_str[], uint16 splids[], 362 | uint16 lemma_len, uint16 count, uint64 lmt); 363 | 364 | // Check if a lemma is in dictionary 365 | int32 locate_in_offsets(char16 lemma_str[], 366 | 
uint16 splid_str[], uint16 lemma_len); 367 | 368 | bool remove_lemma_by_offset_index(int offset_index); 369 | #ifdef ___PREDICT_ENABLED___ 370 | uint32 locate_where_to_insert_in_predicts(const uint16 * words, 371 | int lemma_len); 372 | 373 | int32 locate_first_in_predicts(const uint16 * words, int lemma_len); 374 | 375 | void remove_lemma_from_predict_list(uint32 offset); 376 | #endif 377 | #ifdef ___SYNC_ENABLED___ 378 | void queue_lemma_for_sync(LemmaIdType id); 379 | 380 | void remove_lemma_from_sync_list(uint32 offset); 381 | 382 | void write_back_sync(int fd); 383 | #endif 384 | void write_back_score(int fd); 385 | void write_back_offset(int fd); 386 | void write_back_lemma(int fd); 387 | void write_back_all(int fd); 388 | void write_back(); 389 | 390 | struct UserDictScoreOffsetPair { 391 | int score; 392 | uint32 offset_index; 393 | }; 394 | 395 | inline void swap(UserDictScoreOffsetPair * sop, int i, int j); 396 | 397 | void shift_down(UserDictScoreOffsetPair * sop, int i, int n); 398 | 399 | // On-disk format for each lemma 400 | // +-------------+ 401 | // | Version (4) | 402 | // +-------------+ 403 | // +-----------+-----------+--------------------+-------------------+ 404 | // | Spare (1) | Nchar (1) | Splids (2 x Nchar) | Lemma (2 x Nchar) | 405 | // +-----------+-----------+--------------------+-------------------+ 406 | // ... 407 | // +-----------------------+ +-------------+ <---Offset of offset 408 | // | Offset1 by_splids (4) | ... | OffsetN (4) | 409 | // +-----------------------+ +-------------+ 410 | #ifdef ___PREDICT_ENABLED___ 411 | // +----------------------+ +-------------+ 412 | // | Offset1 by_lemma (4) | ... | OffsetN (4) | 413 | // +----------------------+ +-------------+ 414 | #endif 415 | // +------------+ +------------+ 416 | // | Score1 (4) | ... | ScoreN (4) | 417 | // +------------+ +------------+ 418 | #ifdef ___SYNC_ENABLED___ 419 | // +-------------+ +-------------+ 420 | // | NewAdd1 (4) | ... 
| NewAddN (4) | 421 | // +-------------+ +-------------+ 422 | #endif 423 | // +----------------+ 424 | // | Dict Info (4x) | 425 | // +----------------+ 426 | }; 427 | } 428 | 429 | #endif 430 | -------------------------------------------------------------------------------- /pinyin/utf16char.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009 The Android Open Source Project 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include "utf16char.h" 19 | 20 | namespace ime_pinyin { 21 | 22 | #ifdef __cplusplus 23 | extern "C" { 24 | #endif 25 | // Splits off the next token delimited by ' ', '\n' or '\t'. The delimiter is overwritten with '\0', *token_size receives the token length and *utf16_str_next the rest of the input (NULL at the end). Returns NULL for NULL arguments or when no token is left. 26 | char16* utf16_strtok(char16 *utf16_str, size_t *token_size, 27 | char16 **utf16_str_next) { 28 | if (NULL == utf16_str || NULL == token_size || NULL == utf16_str_next) { 29 | return NULL; 30 | } 31 | 32 | // Skip the splitters 33 | size_t pos = 0; 34 | while ((char16)' ' == utf16_str[pos] || (char16)'\n' == utf16_str[pos] 35 | || (char16)'\t' == utf16_str[pos]) 36 | pos++; 37 | 38 | utf16_str += pos; 39 | pos = 0; 40 | 41 | while ((char16)'\0' != utf16_str[pos] && (char16)' ' != utf16_str[pos] 42 | && (char16)'\n' != utf16_str[pos] 43 | && (char16)'\t' != utf16_str[pos]) { 44 | pos++; 45 | } 46 | 47 | char16 *ret_val = utf16_str; 48 | if ((char16)'\0' == utf16_str[pos]) { 49 | *utf16_str_next = NULL; 50 | if (0 == pos) 51 | return NULL; 52 | } else { 53 | *utf16_str_next = utf16_str + pos + 1; 54 | } 55 | 56 | utf16_str[pos] = (char16)'\0'; 57 | *token_size = pos; 58 | 59 | return ret_val; 60 | } 61 | // Parses an optional '-' followed by decimal digits; returns 0 for NULL. No overflow checking is done. 62 | int utf16_atoi(const char16 *utf16_str) { 63 | if (NULL == utf16_str) 64 | return 0; 65 | 66 | int value = 0; 67 | int sign = 1; 68 | size_t pos = 0; 69 | 70 | if ((char16)'-' == utf16_str[pos]) { 71 | sign = -1; 72 | pos++; 73 | } 74 | 75 | while ((char16)'0' <= utf16_str[pos] && 76 | (char16)'9' >= utf16_str[pos]) { 77 | value = value * 10 + static_cast<int>(utf16_str[pos] - (char16)'0'); 78 | pos++; 79 | } 80 | 81 | return value*sign; 82 | } 83 | // Narrows the string into a 256-byte char buffer and parses it with atof(); returns 0 for NULL or for strings of 256 or more units. 84 | float utf16_atof(const char16 *utf16_str) { 85 | // A temporary implementation. 
86 | char char8[256]; 87 | if (NULL == utf16_str || utf16_strlen(utf16_str) >= 256) return 0; 88 | 89 | utf16_strcpy_tochar(char8, utf16_str); 90 | return atof(char8); 91 | } 92 | // Number of char16 units before the terminating '\0'; 0 for NULL. 93 | size_t utf16_strlen(const char16 *utf16_str) { 94 | if (NULL == utf16_str) 95 | return 0; 96 | 97 | size_t size = 0; 98 | while ((char16)'\0' != utf16_str[size]) 99 | size++; 100 | return size; 101 | } 102 | // strcmp()-style comparison of code units; the arguments are not NULL-checked. 103 | int utf16_strcmp(const char16* str1, const char16* str2) { 104 | size_t pos = 0; 105 | while (str1[pos] == str2[pos] && (char16)'\0' != str1[pos]) 106 | pos++; 107 | 108 | return static_cast<int>(str1[pos]) - static_cast<int>(str2[pos]); 109 | } 110 | // Like utf16_strcmp(), but compares at most size units. 111 | int utf16_strncmp(const char16 *str1, const char16 *str2, size_t size) { 112 | size_t pos = 0; 113 | while (pos < size && str1[pos] == str2[pos] && (char16)'\0' != str1[pos]) 114 | pos++; 115 | 116 | if (pos == size) 117 | return 0; 118 | 119 | return static_cast<int>(str1[pos]) - static_cast<int>(str2[pos]); 120 | } 121 | 122 | // we do not consider overlapping 123 | char16* utf16_strcpy(char16 *dst, const char16 *src) { 124 | if (NULL == src || NULL == dst) 125 | return NULL; 126 | 127 | char16* cp = dst; 128 | 129 | while ((char16)'\0' != *src) { 130 | *cp = *src; 131 | cp++; 132 | src++; 133 | } 134 | 135 | *cp = *src; 136 | 137 | return dst; 138 | } 139 | // Copies up to size units and stops after a copied '\0'; it neither pads nor terminates the result. Returns NULL for NULL arguments or size 0. When dst lies inside [src, src + size) the copy runs backward. 140 | char16* utf16_strncpy(char16 *dst, const char16 *src, size_t size) { 141 | if (NULL == src || NULL == dst || 0 == size) 142 | return NULL; 143 | 144 | if (src == dst) 145 | return dst; 146 | 147 | char16* cp = dst; 148 | 149 | if (dst < src || (dst > src && dst >= src + size)) { 150 | while (size-- && (*cp++ = *src++)) 151 | ; 152 | } else { 153 | cp += size - 1; 154 | src += size - 1; 155 | while (size-- && (*cp-- = *src--)) // assign, not compare, so the overlapping case really copies 156 | ; // NOTE(review): the backward pass stops at the first '\0' met from the end - confirm callers pass NUL-free ranges 157 | } 158 | return dst; 159 | } 160 | 161 | // We do not handle complicated cases like overlapping, because in this 162 | // codebase, it is not necessary. 
163 | char* utf16_strcpy_tochar(char *dst, const char16 *src) { 164 | if (NULL == src || NULL == dst) 165 | return NULL; 166 | 167 | char* cp = dst; 168 | 169 | while ((char16)'\0' != *src) { 170 | *cp = static_cast<char>(*src); 171 | cp++; 172 | src++; 173 | } 174 | *cp = *src; 175 | 176 | return dst; 177 | } 178 | 179 | #ifdef __cplusplus 180 | } 181 | #endif 182 | } // namespace ime_pinyin 183 | -------------------------------------------------------------------------------- /pinyin/utf16char.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009 The Android Open Source Project 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #ifndef PINYINIME_INCLUDE_UTF16CHAR_H__ 18 | #define PINYINIME_INCLUDE_UTF16CHAR_H__ 19 | 20 | #include 21 | #include 22 | 23 | namespace ime_pinyin { 24 | 25 | #ifdef __cplusplus 26 | extern "C" { 27 | #endif 28 | // 16-bit character unit used by all utf16_* helpers below 29 | typedef uint16_t char16; 30 | 31 | // Get a token from utf16_str, 32 | // Returned pointer is a '\0'-terminated utf16 string, or NULL 33 | // *utf16_str_next returns the next part of the string for further tokenizing 34 | char16* utf16_strtok(char16 *utf16_str, size_t *token_size, 35 | char16 **utf16_str_next); 36 | // Parses an optional '-' followed by decimal digits; returns 0 for NULL 37 | int utf16_atoi(const char16 *utf16_str); 38 | // Converts through a 256-byte char buffer and atof(); returns 0 if the string has 256 or more units 39 | float utf16_atof(const char16 *utf16_str); 40 | // Length in char16 units, excluding the terminator; 0 for NULL 41 | size_t utf16_strlen(const char16 *utf16_str); 42 | // strcmp()/strncmp()-style comparison of code units; the arguments are not NULL-checked 43 | int utf16_strcmp(const char16 *str1, const char16 *str2); 44 | int utf16_strncmp(const char16 *str1, const char16 *str2, size_t size); 45 | // Copy helpers: both return dst, or NULL when an argument is NULL (utf16_strncpy also when size is 0) 46 | char16* utf16_strcpy(char16 *dst, const char16 *src); 47 | char16* utf16_strncpy(char16 *dst, const char16 *src, size_t size); 48 | 49 | // Narrows each unit to char while copying; returns dst, or NULL when an argument is NULL 50 | char* utf16_strcpy_tochar(char *dst, const char16 *src); 51 | 52 | #ifdef __cplusplus 53 | } 54 | #endif 55 | } 56 | 57 | #endif // PINYINIME_INCLUDE_UTF16CHAR_H__ 58 | -------------------------------------------------------------------------------- /pinyin/utf16reader.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009 The Android Open Source Project 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include "utf16reader.h" 18 | 19 | namespace ime_pinyin { 20 | 21 | #define MIN_BUF_LEN 128 22 | #define MAX_BUF_LEN 65535 23 | 24 | Utf16Reader::Utf16Reader() { 25 | fp_ = NULL; 26 | buffer_ = NULL; 27 | buffer_total_len_ = 0; 28 | buffer_next_pos_ = 0; 29 | buffer_valid_len_ = 0; 30 | } 31 | 32 | Utf16Reader::~Utf16Reader() { 33 | if (NULL != fp_) 34 | fclose(fp_); 35 | 36 | if (NULL != buffer_) 37 | delete [] buffer_; 38 | } 39 | 40 | // Opens filename for binary reading, (re)allocates the read buffer (length clamped to [MIN_BUF_LEN, MAX_BUF_LEN] units) and requires a leading 0xfeff unit. Returns false on any failure. 41 | bool Utf16Reader::open(const char* filename, size_t buffer_len) { 42 | if (filename == NULL) 43 | return false; 44 | 45 | if (buffer_len < MIN_BUF_LEN) 46 | buffer_len = MIN_BUF_LEN; 47 | else if (buffer_len > MAX_BUF_LEN) 48 | buffer_len = MAX_BUF_LEN; 49 | 50 | buffer_total_len_ = buffer_len; 51 | buffer_next_pos_ = 0; buffer_valid_len_ = 0; // forget data buffered from a previously opened file 52 | if (NULL != buffer_) 53 | delete [] buffer_; 54 | buffer_ = new char16[buffer_total_len_]; 55 | if (NULL == buffer_) 56 | return false; 57 | if (NULL != fp_) fclose(fp_); // do not leak a file that is still open 58 | if ((fp_ = fopen(filename, "rb")) == NULL) 59 | return false; 60 | 61 | // the UTF16 file header, skip 62 | char16 header; 63 | if (fread(&header, sizeof(header), 1, fp_) != 1 || header != 0xfeff) { 64 | fclose(fp_); 65 | fp_ = NULL; 66 | return false; 67 | } 68 | 69 | return true; 70 | } 71 | // Reads one line (up to '\n'; a '\r' directly before it is dropped) into read_buf, writing at most max_len units including the terminator. Returns NULL when the reader is not open, an argument is invalid or no data is left. When a line is cut at max_len - 1, the unit at the cut position is consumed but not stored. 72 | char16* Utf16Reader::readline(char16* read_buf, size_t max_len) { 73 | if (NULL == fp_ || NULL == read_buf || 0 == max_len) 74 | return NULL; 75 | 76 | size_t ret_len = 0; 77 | 78 | do { 79 | if (buffer_valid_len_ == 0) { 80 | buffer_next_pos_ = 0; 81 | buffer_valid_len_ = fread(buffer_, sizeof(char16), 82 | buffer_total_len_, fp_); 83 | if (buffer_valid_len_ == 0) { 84 | if (0 == ret_len) 85 | return NULL; 86 | read_buf[ret_len] = (char16)'\0'; 87 | return read_buf; 88 | } 89 | } 90 | 91 | for (size_t i = 0; i < buffer_valid_len_; i++) { 92 | if (ret_len + i == max_len - 1 || 93 | buffer_[buffer_next_pos_ + i] == (char16)'\n') { 94 | if (ret_len + i > 0 && 
read_buf[ret_len + i - 1] == (char16)'\r') { 95 | read_buf[ret_len + i - 1] = (char16)'\0'; 96 | } else { 97 | read_buf[ret_len + i] = (char16)'\0'; 98 | } 99 | 100 | i++; 101 | buffer_next_pos_ += i; 102 | buffer_valid_len_ -= i; 103 | if (buffer_next_pos_ == buffer_total_len_) { 104 | buffer_next_pos_ = 0; 105 | buffer_valid_len_ = 0; 106 | } 107 | return read_buf; 108 | } else { 109 | read_buf[ret_len + i] = buffer_[buffer_next_pos_ + i]; 110 | } 111 | } 112 | 113 | ret_len += buffer_valid_len_; 114 | buffer_valid_len_ = 0; 115 | } while (true); 116 | 117 | // Never reach here 118 | return NULL; 119 | } 120 | // Closes the file and frees the buffer; always returns true. 121 | bool Utf16Reader::close() { 122 | if (NULL != fp_) 123 | fclose(fp_); 124 | fp_ = NULL; 125 | 126 | if (NULL != buffer_) 127 | delete [] buffer_; 128 | buffer_ = NULL; 129 | return true; 130 | } 131 | } // namespace ime_pinyin 132 | -------------------------------------------------------------------------------- /pinyin/utf16reader.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009 The Android Open Source Project 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #ifndef PINYINIME_INCLUDE_UTF16READER_H__ 18 | #define PINYINIME_INCLUDE_UTF16READER_H__ 19 | 20 | #include 21 | #include "./utf16char.h" 22 | 23 | namespace ime_pinyin { 24 | 25 | class Utf16Reader { 26 | private: 27 | FILE *fp_; 28 | char16 *buffer_; 29 | size_t buffer_total_len_; 30 | size_t buffer_next_pos_; 31 | 32 | // Never more than buffer_total_len_ - buffer_next_pos_ 33 | size_t buffer_valid_len_; 34 | 35 | public: 36 | Utf16Reader(); 37 | ~Utf16Reader(); 38 | 39 | // filename is the name of the file to open. 40 | // buffer_len specifies how long buffer should be allocated to speed up the 41 | // future reading 42 | bool open(const char* filename, size_t buffer_len); 43 | char16* readline(char16* read_buf, size_t max_len); 44 | bool close(); 45 | }; 46 | } 47 | 48 | #endif // PINYINIME_INCLUDE_UTF16READER_H__ 49 | -------------------------------------------------------------------------------- /pinyininputmethod.cpp: -------------------------------------------------------------------------------- 1 | #include "pinyininputmethod.h" 2 | #include 3 | #include 4 | #include 5 | #include "pinyinime.h" 6 | using namespace ime_pinyin; 7 | // Loads the English word list (key pyEn) from :/dict/pinyinEx.ini and, if it holds more than one entry, opens the pinyin decoder. 8 | CPinyinInputMethod::CPinyinInputMethod() 9 | { 10 | HanziModel.clear(); 11 | 12 | QSettings setter(":/dict/pinyinEx.ini", QSettings::IniFormat); 13 | 14 | lstEN = setter.value("pyEn", lstEN).toStringList(); 15 | if (lstEN.size() <= 1) 16 | { 17 | qDebug() << "Can't load pyEn!"; 18 | return; 19 | } 20 | bool ret = im_open_decoder("./dict/dict_pinyin.dat", "./dict/user.dat"); 21 | if (!ret) 22 | { 23 | qDebug() << "open dict faild"; 24 | return; 25 | } 26 | } 27 | 28 | CPinyinInputMethod::~CPinyinInputMethod() 29 | { 30 | lstEN.clear(); 31 | HanziModel.clear(); 32 | im_close_decoder(); 33 | } 34 | // Runs a pinyin search for gemfield, emits the spelling with ' inserted at the split positions and fills HanziModel with up to 50 candidates. 35 | void CPinyinInputMethod::SearchCN(const QString &gemfield) 36 | { 37 | HanziModel.clear(); 38 | if (gemfield == "") 39 | { 40 | emit pinyinTextChanged(""); 41 | return; 42 | } 43 | im_reset_search(); 44 | int 
num = im_search(gemfield.toLatin1().data(), gemfield.size()); 45 | const uint16 *pos; 46 | int size = im_get_spl_start_pos(pos); 47 | QString py = gemfield; 48 | for (int i = size-1; i > 0; i--) 49 | py.insert(pos[i], "'"); 50 | py.replace("''", "'"); 51 | emit pinyinTextChanged(py); 52 | if (num > 50) num = 50; 53 | char16 buf[20]={0}; 54 | for (int i = 0; i < num; i++) 55 | { 56 | im_get_candidate(i, buf, 20); 57 | HanziModel << QString::fromUtf16(buf); 58 | } 59 | } 60 | // Appends to HanziModel the entries of lstEN that start with gemfield (case-insensitive, at most 49). NOTE(review): the bisection assumes lstEN is sorted by lower-cased text - confirm against pinyinEx.ini. 61 | void CPinyinInputMethod::BinarySearchEN(const QString &gemfield) 62 | { 63 | emit pinyinTextChanged(gemfield); 64 | int min = 0; 65 | int max = lstEN.size(); 66 | int idx = max / 2; 67 | if (lstEN.isEmpty()) return; // nothing to search; lstEN[idx] below must not index an empty list 68 | while (true) 69 | { 70 | if (lstEN[idx].startsWith(gemfield, Qt::CaseInsensitive)) 71 | break; 72 | if (max == min + 1 || max <= min || max == idx + 1 || max == idx || min == idx + 1 || min == idx ) 73 | break; 74 | if (lstEN[idx].toLower() > gemfield.toLower()) // lower-case both sides so upper-case input bisects correctly 75 | max = idx; 76 | else 77 | min = idx; 78 | idx = (max + min) / 2; 79 | } 80 | do{ 81 | if (--idx < 0) 82 | break; 83 | }while(lstEN[idx].startsWith(gemfield, Qt::CaseInsensitive)); 84 | 85 | ++idx; 86 | int cnt = 0; 87 | while(++cnt < 50) 88 | { 89 | if (idx >= lstEN.size()) 90 | break; 91 | if (lstEN[idx].startsWith(gemfield, Qt::CaseInsensitive)) 92 | HanziModel.append(lstEN[idx]); 93 | else 94 | break; 95 | idx++; 96 | } 97 | } 98 | // Clears HanziModel, then dispatches to BinarySearchEN (isEn) or SearchCN. 99 | void CPinyinInputMethod::Matching(const QString &gemfield, bool isEn) 100 | { 101 | HanziModel.clear(); 102 | if (!isEn) SearchCN(gemfield); 103 | else BinarySearchEN(gemfield); 104 | } 105 | -------------------------------------------------------------------------------- /pinyininputmethod.h: -------------------------------------------------------------------------------- 1 | #ifndef PINYININPUTMETHOD_H 2 | #define PINYININPUTMETHOD_H 3 | 4 | #include 5 | #include 6 | 7 | class CPinyinInputMethod: public QObject 8 | { 9 | Q_OBJECT 10 | public: 11 | CPinyinInputMethod(); 12 | ~CPinyinInputMethod(); 13 | 14 | 
signals: 15 | void pinyinTextChanged(const QString&); 16 | 17 | public: 18 | void SearchCN(const QString &gemfield); 19 | void BinarySearchEN(const QString &gemfield); 20 | void Matching(const QString &gemfield, bool isEn = true); 21 | QStringList HanziModel; 22 | 23 | private: 24 | QStringList lstEN; 25 | }; 26 | 27 | #endif 28 | -------------------------------------------------------------------------------- /platforminputcontextplugin.cpp: -------------------------------------------------------------------------------- 1 | #include "platforminputcontextplugin.h" 2 | #include "inputcontext.h" 3 | // Returns a new InputContext when the requested key equals "Qt5Input" (case-insensitive), otherwise a null pointer. 4 | QPlatformInputContext *PlatformInputContextPlugin::create(const QString& system, const QStringList& paramList) 5 | { 6 | Q_UNUSED(paramList); 7 | const bool keyMatches = QString::compare(system, QStringLiteral("Qt5Input"), Qt::CaseInsensitive) == 0; 8 | QPlatformInputContext *context = keyMatches ? new InputContext : 0; 9 | return context; 10 | } 11 | -------------------------------------------------------------------------------- /platforminputcontextplugin.h: -------------------------------------------------------------------------------- 1 | #ifndef MOCKUPPLATFORMINPUTCONTEXTPLUGIN_H 2 | #define MOCKUPPLATFORMINPUTCONTEXTPLUGIN_H 3 | 4 | #include "virtualkeyboard_global.h" 5 | #include 6 | 7 | class VIRTUALKEYBOARDSHARED_EXPORT PlatformInputContextPlugin: public QPlatformInputContextPlugin 8 | { 9 | Q_OBJECT 10 | Q_PLUGIN_METADATA(IID "org.qt-project.Qt.QPlatformInputContextFactoryInterface" FILE "Qt5Input.json") 11 | 12 | public: 13 | QPlatformInputContext *create(const QString&, const QStringList&); 14 | }; 15 | 16 | #endif // MOCKUPPLATFORMINPUTCONTEXTPLUGIN_H 17 | -------------------------------------------------------------------------------- /res.qrc: -------------------------------------------------------------------------------- 1 | 2 | 3 | images/btn1.png 4 | images/btn2.png 5 | images/btn3.png 6 | images/change.png 7 | font/FontAwesome.otf 8 | dict/pinyinEx.ini 9 | dict/dict_pinyin.dat 10 | 11 | 12 | 
-------------------------------------------------------------------------------- /virtualkeyboard_global.h: -------------------------------------------------------------------------------- 1 | #ifndef VIRTUALKEYBOARD_GLOBAL_H 2 | #define VIRTUALKEYBOARD_GLOBAL_H 3 | 4 | #include 5 | 6 | #if defined(VIRTUALKEYBOARD_LIBRARY) 7 | # define VIRTUALKEYBOARDSHARED_EXPORT Q_DECL_EXPORT 8 | #else 9 | # define VIRTUALKEYBOARDSHARED_EXPORT Q_DECL_IMPORT 10 | #endif 11 | 12 | #endif // VIRTUALKEYBOARD_GLOBAL_H 13 | --------------------------------------------------------------------------------