├── README ├── __init__.py ├── res ├── iris.data └── names.txt ├── src ├── __init__.py ├── alg.py ├── aot.py ├── config.py ├── convert-rnc.py ├── create-lexicon.py ├── dep.py ├── liblinear.patch ├── ml │ ├── __init__.py │ ├── nb.py │ ├── nn.py │ └── svm.py ├── morph.py ├── mstparser.py ├── parsers │ ├── __init__.py │ └── cyk.py ├── pos.py ├── rnc.py ├── sentiment │ ├── __init__.py │ ├── demo.html │ ├── demo.py │ ├── download-kinopoisk.py │ ├── index.py │ ├── public │ │ ├── main.css │ │ └── reset5.css │ ├── test.py │ ├── train.py │ └── validate.py ├── syntagrus.py ├── template.py └── train.py ├── test ├── pos-test.py ├── test-alg.py ├── test-cyk.py ├── test-iris.py ├── test-names.py ├── test-nn.py └── test-polarity.py ├── tmp ├── ids.pickle └── svm.model └── web ├── .server.py.swp ├── html ├── .tagging.html.swp └── tagging.html ├── public ├── .htaccess ├── css │ ├── html5.js │ ├── main.css │ └── reset5.css ├── favicon.ico └── js │ ├── .d3-tree-test.js.swp │ ├── Curry-1.0.1.js │ ├── d3-tree-test.js │ ├── d3.v2.min.js │ ├── dracula_algorithms.js │ ├── dracula_graffle.js │ ├── dracula_graph.js │ ├── html5.js │ ├── jquery-1.4.2.min.js │ ├── raphael-min.js │ ├── seedrandom.js │ └── sigma.min.js └── server.py /README: -------------------------------------------------------------------------------- 1 | Pyrus is a project of analyzing and parsing of Russian language in Python 3. 2 | It is not to replace NLTK, but rather for practice. 
3 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/irokez/Pyrus/9a5c64991592ca13e0a5726f394fbe2a399501c3/__init__.py -------------------------------------------------------------------------------- /res/iris.data: -------------------------------------------------------------------------------- 1 | 5.1,3.5,1.4,0.2,Iris-setosa 2 | 4.9,3.0,1.4,0.2,Iris-setosa 3 | 4.7,3.2,1.3,0.2,Iris-setosa 4 | 4.6,3.1,1.5,0.2,Iris-setosa 5 | 5.0,3.6,1.4,0.2,Iris-setosa 6 | 5.4,3.9,1.7,0.4,Iris-setosa 7 | 4.6,3.4,1.4,0.3,Iris-setosa 8 | 5.0,3.4,1.5,0.2,Iris-setosa 9 | 4.4,2.9,1.4,0.2,Iris-setosa 10 | 4.9,3.1,1.5,0.1,Iris-setosa 11 | 5.4,3.7,1.5,0.2,Iris-setosa 12 | 4.8,3.4,1.6,0.2,Iris-setosa 13 | 4.8,3.0,1.4,0.1,Iris-setosa 14 | 4.3,3.0,1.1,0.1,Iris-setosa 15 | 5.8,4.0,1.2,0.2,Iris-setosa 16 | 5.7,4.4,1.5,0.4,Iris-setosa 17 | 5.4,3.9,1.3,0.4,Iris-setosa 18 | 5.1,3.5,1.4,0.3,Iris-setosa 19 | 5.7,3.8,1.7,0.3,Iris-setosa 20 | 5.1,3.8,1.5,0.3,Iris-setosa 21 | 5.4,3.4,1.7,0.2,Iris-setosa 22 | 5.1,3.7,1.5,0.4,Iris-setosa 23 | 4.6,3.6,1.0,0.2,Iris-setosa 24 | 5.1,3.3,1.7,0.5,Iris-setosa 25 | 4.8,3.4,1.9,0.2,Iris-setosa 26 | 5.0,3.0,1.6,0.2,Iris-setosa 27 | 5.0,3.4,1.6,0.4,Iris-setosa 28 | 5.2,3.5,1.5,0.2,Iris-setosa 29 | 5.2,3.4,1.4,0.2,Iris-setosa 30 | 4.7,3.2,1.6,0.2,Iris-setosa 31 | 4.8,3.1,1.6,0.2,Iris-setosa 32 | 5.4,3.4,1.5,0.4,Iris-setosa 33 | 5.2,4.1,1.5,0.1,Iris-setosa 34 | 5.5,4.2,1.4,0.2,Iris-setosa 35 | 4.9,3.1,1.5,0.1,Iris-setosa 36 | 5.0,3.2,1.2,0.2,Iris-setosa 37 | 5.5,3.5,1.3,0.2,Iris-setosa 38 | 4.9,3.1,1.5,0.1,Iris-setosa 39 | 4.4,3.0,1.3,0.2,Iris-setosa 40 | 5.1,3.4,1.5,0.2,Iris-setosa 41 | 5.0,3.5,1.3,0.3,Iris-setosa 42 | 4.5,2.3,1.3,0.3,Iris-setosa 43 | 4.4,3.2,1.3,0.2,Iris-setosa 44 | 5.0,3.5,1.6,0.6,Iris-setosa 45 | 5.1,3.8,1.9,0.4,Iris-setosa 46 | 4.8,3.0,1.4,0.3,Iris-setosa 47 | 5.1,3.8,1.6,0.2,Iris-setosa 
48 | 4.6,3.2,1.4,0.2,Iris-setosa 49 | 5.3,3.7,1.5,0.2,Iris-setosa 50 | 5.0,3.3,1.4,0.2,Iris-setosa 51 | 7.0,3.2,4.7,1.4,Iris-versicolor 52 | 6.4,3.2,4.5,1.5,Iris-versicolor 53 | 6.9,3.1,4.9,1.5,Iris-versicolor 54 | 5.5,2.3,4.0,1.3,Iris-versicolor 55 | 6.5,2.8,4.6,1.5,Iris-versicolor 56 | 5.7,2.8,4.5,1.3,Iris-versicolor 57 | 6.3,3.3,4.7,1.6,Iris-versicolor 58 | 4.9,2.4,3.3,1.0,Iris-versicolor 59 | 6.6,2.9,4.6,1.3,Iris-versicolor 60 | 5.2,2.7,3.9,1.4,Iris-versicolor 61 | 5.0,2.0,3.5,1.0,Iris-versicolor 62 | 5.9,3.0,4.2,1.5,Iris-versicolor 63 | 6.0,2.2,4.0,1.0,Iris-versicolor 64 | 6.1,2.9,4.7,1.4,Iris-versicolor 65 | 5.6,2.9,3.6,1.3,Iris-versicolor 66 | 6.7,3.1,4.4,1.4,Iris-versicolor 67 | 5.6,3.0,4.5,1.5,Iris-versicolor 68 | 5.8,2.7,4.1,1.0,Iris-versicolor 69 | 6.2,2.2,4.5,1.5,Iris-versicolor 70 | 5.6,2.5,3.9,1.1,Iris-versicolor 71 | 5.9,3.2,4.8,1.8,Iris-versicolor 72 | 6.1,2.8,4.0,1.3,Iris-versicolor 73 | 6.3,2.5,4.9,1.5,Iris-versicolor 74 | 6.1,2.8,4.7,1.2,Iris-versicolor 75 | 6.4,2.9,4.3,1.3,Iris-versicolor 76 | 6.6,3.0,4.4,1.4,Iris-versicolor 77 | 6.8,2.8,4.8,1.4,Iris-versicolor 78 | 6.7,3.0,5.0,1.7,Iris-versicolor 79 | 6.0,2.9,4.5,1.5,Iris-versicolor 80 | 5.7,2.6,3.5,1.0,Iris-versicolor 81 | 5.5,2.4,3.8,1.1,Iris-versicolor 82 | 5.5,2.4,3.7,1.0,Iris-versicolor 83 | 5.8,2.7,3.9,1.2,Iris-versicolor 84 | 6.0,2.7,5.1,1.6,Iris-versicolor 85 | 5.4,3.0,4.5,1.5,Iris-versicolor 86 | 6.0,3.4,4.5,1.6,Iris-versicolor 87 | 6.7,3.1,4.7,1.5,Iris-versicolor 88 | 6.3,2.3,4.4,1.3,Iris-versicolor 89 | 5.6,3.0,4.1,1.3,Iris-versicolor 90 | 5.5,2.5,4.0,1.3,Iris-versicolor 91 | 5.5,2.6,4.4,1.2,Iris-versicolor 92 | 6.1,3.0,4.6,1.4,Iris-versicolor 93 | 5.8,2.6,4.0,1.2,Iris-versicolor 94 | 5.0,2.3,3.3,1.0,Iris-versicolor 95 | 5.6,2.7,4.2,1.3,Iris-versicolor 96 | 5.7,3.0,4.2,1.2,Iris-versicolor 97 | 5.7,2.9,4.2,1.3,Iris-versicolor 98 | 6.2,2.9,4.3,1.3,Iris-versicolor 99 | 5.1,2.5,3.0,1.1,Iris-versicolor 100 | 5.7,2.8,4.1,1.3,Iris-versicolor 101 | 6.3,3.3,6.0,2.5,Iris-virginica 102 | 
5.8,2.7,5.1,1.9,Iris-virginica 103 | 7.1,3.0,5.9,2.1,Iris-virginica 104 | 6.3,2.9,5.6,1.8,Iris-virginica 105 | 6.5,3.0,5.8,2.2,Iris-virginica 106 | 7.6,3.0,6.6,2.1,Iris-virginica 107 | 4.9,2.5,4.5,1.7,Iris-virginica 108 | 7.3,2.9,6.3,1.8,Iris-virginica 109 | 6.7,2.5,5.8,1.8,Iris-virginica 110 | 7.2,3.6,6.1,2.5,Iris-virginica 111 | 6.5,3.2,5.1,2.0,Iris-virginica 112 | 6.4,2.7,5.3,1.9,Iris-virginica 113 | 6.8,3.0,5.5,2.1,Iris-virginica 114 | 5.7,2.5,5.0,2.0,Iris-virginica 115 | 5.8,2.8,5.1,2.4,Iris-virginica 116 | 6.4,3.2,5.3,2.3,Iris-virginica 117 | 6.5,3.0,5.5,1.8,Iris-virginica 118 | 7.7,3.8,6.7,2.2,Iris-virginica 119 | 7.7,2.6,6.9,2.3,Iris-virginica 120 | 6.0,2.2,5.0,1.5,Iris-virginica 121 | 6.9,3.2,5.7,2.3,Iris-virginica 122 | 5.6,2.8,4.9,2.0,Iris-virginica 123 | 7.7,2.8,6.7,2.0,Iris-virginica 124 | 6.3,2.7,4.9,1.8,Iris-virginica 125 | 6.7,3.3,5.7,2.1,Iris-virginica 126 | 7.2,3.2,6.0,1.8,Iris-virginica 127 | 6.2,2.8,4.8,1.8,Iris-virginica 128 | 6.1,3.0,4.9,1.8,Iris-virginica 129 | 6.4,2.8,5.6,2.1,Iris-virginica 130 | 7.2,3.0,5.8,1.6,Iris-virginica 131 | 7.4,2.8,6.1,1.9,Iris-virginica 132 | 7.9,3.8,6.4,2.0,Iris-virginica 133 | 6.4,2.8,5.6,2.2,Iris-virginica 134 | 6.3,2.8,5.1,1.5,Iris-virginica 135 | 6.1,2.6,5.6,1.4,Iris-virginica 136 | 7.7,3.0,6.1,2.3,Iris-virginica 137 | 6.3,3.4,5.6,2.4,Iris-virginica 138 | 6.4,3.1,5.5,1.8,Iris-virginica 139 | 6.0,3.0,4.8,1.8,Iris-virginica 140 | 6.9,3.1,5.4,2.1,Iris-virginica 141 | 6.7,3.1,5.6,2.4,Iris-virginica 142 | 6.9,3.1,5.1,2.3,Iris-virginica 143 | 5.8,2.7,5.1,1.9,Iris-virginica 144 | 6.8,3.2,5.9,2.3,Iris-virginica 145 | 6.7,3.3,5.7,2.5,Iris-virginica 146 | 6.7,3.0,5.2,2.3,Iris-virginica 147 | 6.3,2.5,5.0,1.9,Iris-virginica 148 | 6.5,3.0,5.2,2.0,Iris-virginica 149 | 6.2,3.4,5.4,2.3,Iris-virginica 150 | 5.9,3.0,5.1,1.8,Iris-virginica 151 | 152 | -------------------------------------------------------------------------------- /res/names.txt: -------------------------------------------------------------------------------- 1 
| Оксана f 2 | Альберт m 3 | Лазарь m 4 | Христоф m 5 | Рунар m 6 | Феофан m 7 | Джой m 8 | Христина f 9 | Серафим m 10 | Майслав m 11 | Зигмунд m 12 | Антуан m 13 | Климент m 14 | Фрол m 15 | Викторина f 16 | Фернан m 17 | Фаина f 18 | Юджин m 19 | Серафима f 20 | Чеслав m 21 | Арсения f 22 | Аврор m 23 | Эрик m 24 | Самсон m 25 | Рюрик m 26 | Святослава f 27 | Виринея f 28 | Казимир m 29 | Кира f 30 | Вильям m 31 | Габриель m 32 | Марин m 33 | Торез m 34 | Ираклия f 35 | Фелиция f 36 | Нада f 37 | Гавриил m 38 | Милада f 39 | Евгения f 40 | Иосиф m 41 | Иоанн m 42 | Феодосий m 43 | Исей m 44 | Назар m 45 | Вилиор m 46 | Анисим m 47 | Лель m 48 | Нил m 49 | Филат m 50 | Прасковья f 51 | Архип m 52 | Светослав m 53 | Ратмир m 54 | Равель m 55 | Владимира f 56 | Нелли f 57 | Юлиан m 58 | Будимир m 59 | Вера f 60 | Оливия f 61 | Веселина f 62 | Слава m 63 | Володар m 64 | Адольф m 65 | Любим m 66 | Винцент m 67 | Лионель m 68 | Евлогий m 69 | Харита f 70 | Ксенофонт m 71 | Любава f 72 | Аглаида f 73 | Андрей m 74 | Кондратий m 75 | Дана f 76 | Аполлинария f 77 | Равиль m 78 | Эмилий m 79 | Флоренция f 80 | Неон m 81 | Дарья f 82 | Аскольд m 83 | Савва m 84 | Ганна f 85 | Арина f 86 | Юлиана f 87 | Аврелия f 88 | Зиновий m 89 | Леонида f 90 | Матвей m 91 | Гораций m 92 | Константин m 93 | Дарина f 94 | Эвальд m 95 | Стефания f 96 | Мир m 97 | Святополк m 98 | Юдифь f 99 | Августин m 100 | Марат m 101 | Лаэрт m 102 | Антип m 103 | Алёна f 104 | Мира f 105 | Лина f 106 | Феоктиста f 107 | Свет m 108 | Ювеналий m 109 | Сюзанна f 110 | Юлия f 111 | Аргент m 112 | Рубен m 113 | Мальвина f 114 | Ромуальд m 115 | Олимпий m 116 | Мелания f 117 | Воислав m 118 | Ян m 119 | Венедикт m 120 | Мирослава f 121 | Касьян m 122 | Юстин m 123 | Тигран m 124 | Еруслан m 125 | Рем m 126 | Вилена f 127 | Зарина f 128 | Клеопатра f 129 | Таисия f 130 | Эльмир m 131 | Яромир m 132 | Мария f 133 | Октавиан m 134 | Ричард m 135 | Светлана f 136 | Арий m 137 | Парамон m 138 | Рид m 139 | 
Прохор m 140 | Донара f 141 | Серапион m 142 | Мариан m 143 | Анатолия f 144 | Авангард m 145 | Мстислав m 146 | Роальд m 147 | Герт m 148 | Франц m 149 | Павлина f 150 | Лукерия f 151 | Павел m 152 | Лилиана f 153 | Луиза f 154 | Евгений m 155 | Яна f 156 | Вячеслав m 157 | Февронья f 158 | Тихомир m 159 | Майя f 160 | Ангелина f 161 | Георгий m 162 | Анастасий m 163 | Милен m 164 | Карен m 165 | Магдалина f 166 | Ренат m 167 | Юлий m 168 | Норд m 169 | Филимон m 170 | Валерия f 171 | Мирон m 172 | Прозор m 173 | Исидор m 174 | Лоренс m 175 | Наль m 176 | Тельнан m 177 | Джозеф m 178 | Дидим m 179 | Владлена f 180 | Аксинья f 181 | Мариетта f 182 | Магда f 183 | Арнольд m 184 | Орест m 185 | Регина f 186 | Илиана f 187 | Прокофий m 188 | Агапия f 189 | Анатолий m 190 | Ибрагим m 191 | Евграф m 192 | Ада f 193 | Оливер m 194 | Ананий m 195 | Кристина f 196 | Иона f 197 | Ревмир m 198 | Виолен m 199 | Наталья f 200 | Сильва f 201 | Горн m 202 | Гайя f 203 | Мефодий m 204 | Тихон m 205 | Станислав m 206 | Люсьен m 207 | Яромира f 208 | Сусанна f 209 | Дементий m 210 | Емельян m 211 | Кристоф m 212 | Ревмира f 213 | Никифор m 214 | Венера f 215 | Мари f 216 | Зорий m 217 | Семён m 218 | Евсей m 219 | Руслан m 220 | Чеслава f 221 | Феликсана f 222 | Исай m 223 | Зот m 224 | Антония f 225 | Любомира f 226 | Влада f 227 | Юманита f 228 | Дональт m 229 | Бруно m 230 | Лаврентий m 231 | Лорис m 232 | Илья m 233 | Правдина f 234 | Беатриса f 235 | Болеслав m 236 | Радислав m 237 | Ксения f 238 | Тамила f 239 | Лидия f 240 | Владелин m 241 | Альбин m 242 | Гелла f 243 | Горислава f 244 | Рогнеда f 245 | Феодосия f 246 | Алина f 247 | Юстина f 248 | Эраст m 249 | Феона f 250 | Эсфирь f 251 | Адам m 252 | Руссо m 253 | Савелий m 254 | Макс m 255 | Галактион m 256 | Зоя f 257 | Филадельфия f 258 | Флорентий m 259 | Ярослава f 260 | Монолит m 261 | Беата f 262 | Ярополк m 263 | Сильвест m 264 | Глафира f 265 | Марк m 266 | Злата f 267 | Лениан m 268 | Радамес m 269 | Дар m 270 | 
Эвелина f 271 | Ипатия f 272 | Трифон m 273 | Фёдор m 274 | Корней m 275 | Горимир m 276 | Назим m 277 | Леонила f 278 | Гертруда f 279 | Илиодор m 280 | Алла f 281 | Искра f 282 | Рафаэль m 283 | Лермонт m 284 | Надир m 285 | Краснослав m 286 | Елизавета f 287 | Вилора f 288 | Поликсена f 289 | Алиса f 290 | Сидор m 291 | Бронислав m 292 | Лукиана f 293 | Васса f 294 | Игнатий m 295 | Февралин m 296 | Товий m 297 | Ленина f 298 | Виталия f 299 | Бела f 300 | Храбр m 301 | Евдокия f 302 | Джон m 303 | Леопольд m 304 | Адриан m 305 | Флавия f 306 | Робеспьер m 307 | Ермил m 308 | Адий m 309 | Торий m 310 | Муза f 311 | Конон m 312 | Борислав m 313 | Роза f 314 | Яков m 315 | Роман m 316 | Симона f 317 | Зинаида f 318 | Варфоломей m 319 | Варвара f 320 | Марта f 321 | Улита f 322 | Эдда f 323 | Альфред m 324 | Марс m 325 | Никандр m 326 | Ростислав m 327 | Джеральд m 328 | Екатерина f 329 | Корнил m 330 | Октябрина f 331 | Велор m 332 | Тальяна f 333 | Авдей m 334 | Клавдий m 335 | Колумбий m 336 | Поликарп m 337 | Эрг m 338 | Евдоким m 339 | Вилорг m 340 | Лукьян m 341 | Елизар m 342 | Гаспар m 343 | Болеслава f 344 | Капитон m 345 | Панфил m 346 | Чарлз m 347 | Пахом m 348 | Элина f 349 | Мирослав m 350 | Вадим m 351 | Софокл m 352 | Пересвет m 353 | Милан m 354 | Леонтия f 355 | Степан m 356 | Тит m 357 | Владислав m 358 | Гаральд m 359 | Федор m 360 | Ипатий m 361 | Авдотья f 362 | Августа f 363 | Вольдемар m 364 | Конкордия f 365 | Милад m 366 | Агап m 367 | Анисия f 368 | Дея f 369 | Лилия f 370 | Максим m 371 | Зара f 372 | Митродора f 373 | Оксар m 374 | Владилена f 375 | Иероним m 376 | Платонида f 377 | Рэм m 378 | Вергилий m 379 | Эльвира f 380 | Артём m 381 | Веста f 382 | Вахтанг m 383 | Гвидон m 384 | Клемент m 385 | Амос m 386 | Надия f 387 | Цезарь m 388 | Декабрий m 389 | Фотий m 390 | Флоренц m 391 | Родион m 392 | Северин m 393 | Вавила m 394 | Лора f 395 | Мурат m 396 | Измаил m 397 | Звенислава f 398 | Евстафий m 399 | Эмилия f 400 | Светозара f 
401 | Камиль m 402 | Куприян m 403 | Июлий m 404 | Лука m 405 | Степанида f 406 | Лениана f 407 | Максимильян m 408 | Нельсон m 409 | Мадлен f 410 | Рустем m 411 | Аркадия f 412 | Таир m 413 | Изяслав m 414 | Астрид f 415 | Илий m 416 | Эдмунд m 417 | Бертольд m 418 | Юм m 419 | Арефий m 420 | Отто m 421 | Елена f 422 | Кузьма m 423 | Эрий m 424 | Давыд m 425 | Милия f 426 | Нисон m 427 | Нонна f 428 | Глория f 429 | Боян m 430 | Эмилиан m 431 | Дан m 432 | Геннадий m 433 | Лия f 434 | Арсений m 435 | Лариса f 436 | Владилен m 437 | Меркурий m 438 | Эльдар m 439 | Зигфрид m 440 | Добрыня m 441 | Мирра f 442 | Дина f 443 | Соломон m 444 | Валериан m 445 | Гений m 446 | Викентий m 447 | Ларион m 448 | Софон m 449 | Аристарх m 450 | Ядвига f 451 | Евпраксия f 452 | Воля f 453 | Розалия f 454 | Эдгар m 455 | Арам m 456 | Белла f 457 | Климентий m 458 | Горислав m 459 | Цецилия f 460 | Лор m 461 | Аверкий m 462 | Маврикий m 463 | Севастиан m 464 | Никон m 465 | Аким m 466 | Федора f 467 | Анжела f 468 | Люксен m 469 | Даниар m 470 | Исак m 471 | Арвид m 472 | Пётр m 473 | Евфалия f 474 | Януарий m 475 | Леонид m 476 | Ролан m 477 | Ефим m 478 | Дионисия f 479 | Аэлла f 480 | Гелиан m 481 | Дин m 482 | Манфред m 483 | Омар m 484 | Ульяна f 485 | Еремей m 486 | Левкий m 487 | Олимпия f 488 | Шмидт m 489 | Калина m 490 | Любомир m 491 | Антонин m 492 | Вилор m 493 | Феликс m 494 | Роберт m 495 | Вацлав m 496 | Киприан m 497 | Виталий m 498 | Анимаиса f 499 | Нина f 500 | Туллий m 501 | Северян m 502 | Витольд m 503 | Севастьяна f 504 | Власта f 505 | Жанна f 506 | Боеслав m 507 | Владимир m 508 | Домна f 509 | Вероника f 510 | Эльмар m 511 | Ванадий m 512 | Артемия f 513 | Валерий m 514 | Еликонида f 515 | Нинель f 516 | Зиновия f 517 | Иоанна f 518 | Клариса f 519 | Терентий m 520 | Виссарион m 521 | Пимен m 522 | Капитолина f 523 | Нодар m 524 | Никанор m 525 | Галий m 526 | Эльза f 527 | Влас m 528 | Ева f 529 | Даниил m 530 | Мстислава f 531 | Римма f 532 | Розана f 
533 | Софья f 534 | Эдвард m 535 | Радий m 536 | Авенир m 537 | Баграт m 538 | Василий m 539 | Милий m 540 | Даниэль m 541 | Платон m 542 | Юрий m 543 | Леонтий m 544 | Игорь m 545 | Протас m 546 | Акилина f 547 | Евстолия f 548 | Жозеф m 549 | Геронтий m 550 | Николай m 551 | Мариана f 552 | Ефросиния f 553 | Руслана f 554 | Фарид m 555 | Устин m 556 | Ливадий m 557 | Любовь f 558 | Фелицата f 559 | Маргарита f 560 | Святослав m 561 | Виктория f 562 | Вильгельм m 563 | Ярослав m 564 | Марина f 565 | Ждан m 566 | Адель f 567 | Галина f 568 | Млада f 569 | Вартан m 570 | Маркел m 571 | Дамир m 572 | Сократ m 573 | Лукина f 574 | Булат m 575 | Руфина f 576 | Ермолай m 577 | Май m 578 | Рафаил m 579 | Нана f 580 | Партизан m 581 | Саломея f 582 | Натан m 583 | Всеслава f 584 | Густав m 585 | Эдвин m 586 | Аркадий m 587 | Лада f 588 | Гамлет m 589 | Варлам m 590 | Настасья f 591 | Сабина f 592 | Борислава f 593 | Ленар m 594 | Нестор m 595 | Валентин m 596 | Рада f 597 | Агата f 598 | Голуба f 599 | Рустам m 600 | Виола f 601 | Владислава f 602 | Марлен m 603 | Ия f 604 | Анфия f 605 | Анвар m 606 | Эдип m 607 | Нинелла f 608 | Дорофея f 609 | Октавия f 610 | Олеся f 611 | Лавр m 612 | Наина f 613 | Юзефа f 614 | Ироида f 615 | Эдуард m 616 | Домника f 617 | Людмила f 618 | Флорин m 619 | Григорий m 620 | Гурий m 621 | Ольга f 622 | Милана f 623 | Вольфрам m 624 | Алексей m 625 | Сергей m 626 | Ириней m 627 | Епифан m 628 | Горазд m 629 | Алим m 630 | Аполлинарий m 631 | Лир m 632 | Станислава f 633 | Ольгерд m 634 | Борис m 635 | Арсен m 636 | Видана f 637 | Герман m 638 | Егор m 639 | Оскар m 640 | Янина f 641 | Аэлита f 642 | Аида f 643 | Совет m 644 | Илона f 645 | Светозар m 646 | Харитон m 647 | Корнелий m 648 | Рандольф m 649 | Лоэнгрин m 650 | Абрам m 651 | Виктор m 652 | Элеонора f 653 | Анастасия f 654 | Анфиса f 655 | Карина f 656 | Карп m 657 | Ванда f 658 | Сигизмунд m 659 | Ренальд m 660 | Клара f 661 | Нинел m 662 | Тенгиз m 663 | Афанасий m 664 | Карл m 
665 | Ида f 666 | Атеист m 667 | Глеб m 668 | Радим m 669 | Азалия f 670 | Осип m 671 | Юлитта f 672 | Геральд m 673 | Бронислава f 674 | Шамиль m 675 | Морис m 676 | Тамара f 677 | Варлен m 678 | Катерина f 679 | Гектор m 680 | Добрава f 681 | Никодим m 682 | Анис m 683 | Юнона f 684 | Ромен m 685 | Гермоген m 686 | Селена f 687 | Альвин m 688 | Олег m 689 | Всеволод m 690 | Эрнест m 691 | Надежда f 692 | Слава f 693 | Гарибальди m 694 | Вилен m 695 | Модест m 696 | Астра f 697 | Татьяна f 698 | Эрика f 699 | Стелла f 700 | Сурен m 701 | Вацлава f 702 | Агнеса f 703 | Вевея f 704 | Матильда f 705 | Маркиан m 706 | Ермиония f 707 | Ий m 708 | Симон m 709 | Эль m 710 | Велислав m 711 | Ким m 712 | Марфа f 713 | Евлалия f 714 | Андрон m 715 | Иван m 716 | Всемил m 717 | Афанасия f 718 | Генрих m 719 | Славяна f 720 | Гелия f 721 | Гайдар m 722 | Берта f 723 | Ипполит m 724 | Фернанд m 725 | Текуса f 726 | Дионисий m 727 | Эльбрус m 728 | Фадей m 729 | Василько m 730 | Светлан m 731 | Виль m 732 | Ефимия f 733 | Кирилла f 734 | Жорес m 735 | Кюри m 736 | Милослава f 737 | Лев m 738 | Джульетта f 739 | Спиридон m 740 | Гордей m 741 | Андриан m 742 | Пров m 743 | Ника f 744 | Милослав m 745 | Богдана f 746 | Анна f 747 | Харитина f 748 | Анита f 749 | Фома m 750 | Конрад m 751 | Остап m 752 | Моисей m 753 | Захар m 754 | Флорентина f 755 | Ахмат m 756 | Герасим m 757 | Мелитина f 758 | Боримир m 759 | Аполлон m 760 | Леонард m 761 | Леся f 762 | Одиссей m 763 | Неонил m 764 | Гаяна f 765 | Каролина f 766 | Василиса f 767 | Дорофей m 768 | Ариадна f 769 | Мечислав m 770 | Ульяна m 771 | Самуил m 772 | Флора f 773 | Ерофей m 774 | Александр m 775 | Афиноген m 776 | Зосим m 777 | Новелла f 778 | Мартын m 779 | Пантелеймон m 780 | Аста f 781 | Наум m 782 | Нифонт m 783 | Демократ m 784 | Свобода f 785 | Энвер m 786 | Рената f 787 | Рауль m 788 | Марлена f 789 | Людвиг m 790 | Тарас m 791 | Либерт m 792 | Светослава f 793 | Марианна f 794 | Артур m 795 | Милонег m 796 | 
Ефрем m 797 | Богдан m 798 | Раймонд m 799 | Спартак m 800 | Воин m 801 | Тереза f 802 | Жерар m 803 | Овидий m 804 | Декабрина f 805 | Антонина f 806 | Дарьяна f 807 | Эразм m 808 | Юния f 809 | Робинзон m 810 | Исидора f 811 | Трофим m 812 | Дайна f 813 | Иларий m 814 | Матрёна f 815 | Гелена f 816 | Никита m 817 | Теодор m 818 | Милица f 819 | Марсен m 820 | Василина f 821 | Северьян m 822 | Вилий m 823 | Потап m 824 | Кирилл m 825 | Энергий m 826 | Баян m 827 | Палладий m 828 | Павла f 829 | Октябрин m 830 | Сталь m 831 | Томас m 832 | Теймураз m 833 | Альвиан m 834 | Ирм m 835 | Дориан m 836 | Ливия f 837 | Аза f 838 | Аксён m 839 | Иванна f 840 | Агриппина f 841 | Тристан m 842 | Феофания f 843 | Альбина f 844 | Калиса f 845 | Мавра f 846 | Август m 847 | Икар m 848 | Устинья f 849 | Октябрь m 850 | Ной m 851 | Курт m 852 | Электрон m 853 | Михаил m 854 | Тимофей m 855 | Неонила f 856 | Алевтин m 857 | Юпитер m 858 | Пелагея f 859 | Север m 860 | Ростислава f 861 | Демьян m 862 | Рашид m 863 | Фидель m 864 | Вениамин m 865 | Тимур m 866 | Гранит m 867 | Раиса f 868 | Радмила f 869 | Мануил m 870 | Иннокентий m 871 | Демид m 872 | Вольт m 873 | Калерия f 874 | Илларион m 875 | Филипп m 876 | Клавдия f 877 | Раймонда f 878 | Радомир m 879 | Макар m 880 | Энгель m 881 | Донат m 882 | Антон m 883 | Ираида f 884 | Гарри m 885 | Геодар m 886 | Эмма f 887 | Снежана f 888 | Флориана f 889 | Сила m 890 | Рудольф m 891 | Нора f 892 | Новомир m 893 | Зарема f 894 | Эммануил m 895 | Артамон m 896 | Святогор m 897 | Полина f 898 | Евлампий m 899 | Северина f 900 | Гали f 901 | Ирина f 902 | Вилли m 903 | Денис m 904 | Олимпиада f 905 | Адонис m 906 | Фридрих m 907 | Мюд m 908 | Аглая f 909 | Ираклий m 910 | Эйнар m 911 | Флегонт m 912 | Диана f 913 | Эльмира f 914 | Ждана f 915 | Анри m 916 | Бернард m 917 | Владлен m 918 | Афродита f 919 | Диодора f 920 | Онисим m 921 | Валентина f 922 | Остромир m 923 | Елисей m 924 | Вольмир m 925 | Аврора f 926 | Гертруд m 927 | Кир m 
'''
Created on Aug 19, 2011

@author: alexpak
'''

class Vector:
	"""A simple numeric vector with elementwise addition and subtraction.

	Supports combining with a scalar (int/float, applied to every
	component) or with another Vector (component by component).
	"""

	def __init__(self, iterable):
		self.setData(iterable)

	def setData(self, iterable):
		# Materialize the iterable so indexing and len() are well-defined.
		self._data = list(iterable)

	def __getitem__(self, i):
		return self._data[i]

	def __len__(self):
		return len(self._data)

	def _elementwise(self, term, op):
		"""Return a new Vector combining self with term via binary op.

		Scalars (exactly int or float; the original deliberately used a
		type() check, which excludes bool) are applied to every component;
		Vectors are combined componentwise.  Any other operand type yields
		an unchanged copy, matching the original permissive behavior.
		"""
		data = self._data[:]
		if type(term) in (int, float):
			data = [op(value, term) for value in data]
		elif isinstance(term, Vector):
			data = [op(value, term[i]) for i, value in enumerate(data)]

		return Vector(data)

	def __add__(self, term):
		return self._elementwise(term, lambda a, b: a + b)

	def __sub__(self, term):
		return self._elementwise(term, lambda a, b: a - b)

	def __iadd__(self, term):
		# In-place add: reuse __add__ and adopt the resulting data.
		self.setData(self.__add__(term))
		return self

	def __repr__(self):
		return '[{0}]'.format(', '.join(str(i) for i in self._data))


class Matrix:
	# Stub: planned 2-D counterpart of Vector, never implemented.
	def __init__(self):
		self._data = None

	def __radd__(self, term):
		pass
def load_rules(self, handle):
    """Load the flexia-rule section of an AOT .mrd dictionary into `rules`.

    The section starts with a line holding the number of rule lines; each
    rule line is a '%'-separated list of records of the form
    ``suffix*ancode[*prefix]`` (the prefix part is optional).  Each parsed
    record is stored lowercased as (rule id, prefix, suffix).

    Returns the index of the last rule line processed.
    """
    # create table
    self.db.execute('''create table rules(
        id integer,
        prefix text,
        suffix text)''')

    lines = handle.readline().strip()
    reg_split = re.compile(r'%')
    alf = r'\w'
    # Fix: the named groups were lost in transit (angle brackets stripped),
    # leaving an invalid pattern; reconstructed from the
    # record['prefix'] / record['suffix'] usage below.
    # NOTE(review): group order assumes the AOT flexia layout
    # "ending*ancode[*prefix]" -- confirm against the original .mrd format.
    reg_rule = re.compile('^(?P<suffix>' + alf + '*)\\*(?P<ancode>' + alf + '+)(?:\\*(?P<prefix>' + alf + '+))?$')

    for i in range(0, int(lines)):
        line = handle.readline()
        if not len(line):
            break

        rules = reg_split.split(line.strip())

        for rule in rules:
            match = reg_rule.search(rule)
            if match is not None:
                record = match.groupdict()
                # The optional prefix group comes back as None when absent.
                if 'prefix' not in record or record['prefix'] is None:
                    record['prefix'] = ''

                suffix = record['suffix'].lower()
                prefix = record['prefix'].lower()

                self.db.execute('insert into rules (id, prefix, suffix) values (?, ?, ?)', (i, prefix, suffix))

    self.db.execute('create index rules_id on rules(id)')
    return i

def load_lemmas(self, handle):
    """Load the lemma section of an AOT .mrd dictionary into `lemmas`.

    Each line is whitespace-separated; field 0 is the base (stem) and
    field 1 the flexia-rule id.  A trailing '%' is appended to the stored
    base so it can be used directly in SQL LIKE lookups.

    Returns the index of the last lemma line processed.
    """
    # create table
    self.db.execute('''create table lemmas(
        base text,
        rule integer)''')

    lines = int(handle.readline().strip())
    reg_split = re.compile(r'\s+')

    for i in range(0, lines):
        line = handle.readline()
        if not len(line):
            break

        record = reg_split.split(line)
        self.db.execute('insert into lemmas values(?, ?)', (record[0].lower() + '%', int(record[1])))

    self.db.execute('create index lemmas_base on lemmas(base)')

    return i

def make_forms(self, lemma):
    """Generate every word form for `lemma` ({'base': stem, 'rule': rule id}).

    Each form is prefix + base + suffix for every rule row with the given
    rule id; returns a list of {'base': ..., 'form': ...} dicts.
    """
    self.db.execute('select prefix, suffix from rules where id = ?', (lemma['rule'],))

    forms = []
    for rule in self.db.fetchall():
        forms.append({
            'base': lemma['base'],
            'form': rule[0] + lemma['base'] + rule[1],
        })
    return forms
def normalize(self, word):
    """Return the set of initial (dictionary) forms that match `word`.

    The lookup is case-insensitive; results are cached in memcached under
    the lowercased word.
    """
    word = word.lower()

    cached = self.mc.get(word)
    if cached is not None:
        return cached

    # Candidate stems: the stored base ends with '%' so LIKE matches any
    # word beginning with that stem.
    self.db.execute('select base, rule from lemmas where ? like base', (word,))
    candidates = self.db.fetchall()

    found = []
    for base_like, rule in candidates:
        stem = base_like[:-1]  # strip the trailing '%' stored for LIKE lookups
        forms = self.make_forms({'base': stem, 'rule': rule})
        if any(word == generated['form'] for generated in forms):
            # The first generated form is taken as the initial form.
            found.append(forms[0]['form'])

    result = set(found)
    self.mc.set(word, result)
    return result
def create_db():
    """Create the `words` table and its lookup index in a fresh database.

    Relies on the module-level cursor `cur`; the schema is split on ';'
    and each non-empty statement is executed individually.
    """
    sql = '''
    create table words(
        id integer primary key autoincrement,
        lemma text,
        form text,
        accent integer,
        info text,
        freq integer
    );
    create index words_lemma_form_info_accent on words(lemma, form, info, accent);
    '''
    for statement in sql.split(';'):
        if len(statement.strip()):
            cur.execute(statement)
and accent = ?', (lemma, form, info, accent)) 47 | row = cur.fetchone() 48 | if row is None: 49 | cur.execute('insert into words (lemma, form, info, accent, freq) values (?, ?, ?, ?, 1)', (lemma, form, info, accent)) 50 | else: 51 | cur.execute('update words set freq = freq + 1 where id = ?', row) 52 | 53 | con.commit() 54 | con.close() -------------------------------------------------------------------------------- /src/dep.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | ''' 3 | Created on Nov 22, 2011 4 | 5 | @author: alexpak 6 | ''' 7 | 8 | import sys 9 | import ml 10 | import math 11 | from ml.svm import SVM as Classifier 12 | #from ml.nb import NaiveBayes as Classifier 13 | from collections import Counter, OrderedDict 14 | import sqlite3 15 | import os 16 | import syntagrus 17 | 18 | features = {'m', 'f', 'n', 'nom', 'gen', 'gen2', 'dat', 'acc', 'ins', 'prep', 'loc', 'sg', 'pl', 'real', 'inf', 'advp', 'adjp', 'imp', 'pass', '1p', '2p', '3p'} 19 | 20 | class Linker: 21 | def __init__(self): 22 | self._cl = Classifier() 23 | 24 | def traverse(self, sentences): 25 | x = [] 26 | y = [] 27 | for sentence in sentences: 28 | for w in range(0, len(sentence)): 29 | word_from = sentence[w] 30 | feats = {} 31 | 32 | # meta1 = word_from[1].pos + '_'.join(sorted(word_from[1].feat & features)) 33 | # feats['f:' + meta1] = 1 34 | 35 | for feat in word_from[1].feat & features: 36 | feats['f:' + feat] = 1 37 | feats['fp:' + word_from[1].pos] = 1 38 | feats['fw:' + word_from[0]] = 1 39 | 40 | for v in range(0, len(sentence)): 41 | if v == w: 42 | continue 43 | 44 | word_to = sentence[v] 45 | 46 | feats2 = feats.copy() 47 | # meta2 = word_to[1].pos + '_'.join(sorted(word_to[1].feat & features)) 48 | # feats['t:' + meta2] = 1 49 | 50 | for feat in word_to[1].feat & features: 51 | feats2['t:' + feat] = 1 52 | for feat in word_from[1].feat & word_to[1].feat: 53 | feats2['c:' + feat] = 1 54 | feats2['tp:' + 
word_to[1].pos] = 1 55 | feats2['tw:' + word_to[0]] = 1 56 | feats2['dst'] = float(w - v) 57 | 58 | ''' 59 | for i in range(1, 3): 60 | u = v - i 61 | if u > 0 or u != w: 62 | continue 63 | word_prev = sentence[u] 64 | for feat in word_prev[1].feat: 65 | feats2[str(i) + 'p:' + feat] = 1 66 | # for feat in word_from[1].feat & word_prev[1].feat: 67 | # feats2[str(i) + 'pfc:' + feat] = 1 68 | # for feat in word_to[1].feat & word_prev[1].feat: 69 | # feats2[str(i) + 'ptc:' + feat] = 1 70 | feats2[str(i) + 'pp:' + word_prev[1].pos] = 1 71 | #feats2[str(i) + 'pw:' + word_prev[0]] = 1 72 | ''' 73 | 74 | 75 | # if word_from[1].dom != word_to[1].id: 76 | # continue 77 | 78 | x.append(feats2) 79 | # y.append(word_from[1].link if word_from[1].dom == word_to[1].id else 'none') 80 | y.append(int(word_from[1].dom == word_to[1].id)) 81 | 82 | #endfor w 83 | #endfor sentence 84 | return (x, y) 85 | 86 | def train(self, sentences): 87 | x, y = self.traverse(sentences) 88 | self._cl.train_regression(x, y) 89 | 90 | def predict(self, sentences): 91 | (test_x, test_y) = self.traverse(sentences) 92 | return (self._cl.predict(test_x), test_y) 93 | 94 | def evaluate_bin(self, gold, test): 95 | tp = 0; fp = 0; tn = 0; fn = 0 96 | 97 | for i in range(0, len(gold)): 98 | if gold[i] != 'none': 99 | if test[i] == gold[i]: 100 | tp += 1 101 | else: 102 | fn += 1 103 | else: 104 | if test[i] == gold[i]: 105 | tn += 1 106 | else: 107 | fp += 1 108 | 109 | 110 | acc = (tp + tn) / (tp + fp + tn + fn) if tp + fp + tn + fn else 0 111 | pr = tp / (tp + fp) if tp + fp else 0 112 | rec = tp / (tp + fn) if tp + fn else 0 113 | f1 = 2 * (pr * rec) / (pr + rec) if pr + rec else 0 114 | 115 | return (acc, pr, rec, f1) 116 | 117 | def evaluate_mul(self, gold, test): 118 | tp = 0; fp = 0; tn = 0; fn = 0; cl = 0 119 | 120 | for i in range(0, len(gold)): 121 | if gold[i] != 'none': 122 | if test[i] != 'none': 123 | tp += 1 124 | else: 125 | fn += 1 126 | 127 | if test[i] == gold[i]: 128 | cl += 1 129 | 130 | 
else: 131 | if test[i] == 'none': 132 | tn += 1 133 | else: 134 | fp += 1 135 | 136 | acc = cl / (tp + fn) if tp + fn else 0 137 | pr = tp / (tp + fp) if tp + fp else 0 138 | rec = tp / (tp + fn) if tp + fn else 0 139 | f1 = 2 * (pr * rec) / (pr + rec) if pr + rec else 0 140 | 141 | return (acc, pr, rec, f1) 142 | 143 | def test(self, sentences): 144 | (estim_y, test_y) = self.predict(sentences) 145 | print(Counter(test_y)) 146 | print(Counter(estim_y)) 147 | return self.evaluate_mul(test_y, estim_y) 148 | 149 | def save(self, path): 150 | self._cl.save(path) 151 | 152 | @staticmethod 153 | def load(path): 154 | obj = Linker() 155 | obj._cl = ml.Classifier.load(path) 156 | return obj 157 | 158 | def print_table(data, outfile = sys.stdout, maxlen = {}): 159 | vsep = '|' 160 | endl = '\n' 161 | s = '' 162 | 163 | keys = [] 164 | maxkey = 0 165 | for rowkey, row in data.items(): 166 | l = len(str(rowkey)) 167 | if l > maxkey: 168 | maxkey = l 169 | for key in row: 170 | if key not in keys: 171 | keys.append(key) 172 | l = len(str(row[key])) 173 | if key not in maxlen or l > maxlen[key]: 174 | maxlen[key] = l 175 | 176 | for key in keys: 177 | l = len(str(key)) 178 | if l > maxlen[key]: 179 | maxlen[key] = l 180 | if maxlen[key] < 3: 181 | maxlen[key] = 3 182 | 183 | hline = '+' + '-' * maxkey + '+' + '+'.join(['-' * maxlen[key] for key in keys]) + '+' 184 | 185 | s += endl + hline + endl 186 | s += vsep 187 | s += ' ' * maxkey 188 | s += vsep 189 | s += vsep.join([str(key).ljust(maxlen[key]) for key in keys]) 190 | s += vsep 191 | s += endl + hline + endl 192 | 193 | for rowkey, row in data.items(): 194 | s += vsep 195 | s += str(rowkey).ljust(maxkey) 196 | s += vsep 197 | s += vsep.join([str(row[key] if key in row else '').rjust(maxlen[key]) for key in keys]) 198 | s += vsep 199 | s += endl + hline + endl 200 | 201 | print(s, file=outfile) 202 | return maxlen 203 | 204 | class Parser: 205 | def __init__(self, linker): 206 | self._linker = linker 207 | 208 | def 
parse(self, sentence): 209 | table_estim = OrderedDict() 210 | table_true = OrderedDict() 211 | rowwords = set() 212 | 213 | con = sqlite3.connect('tmp/links') 214 | cur = con.cursor() 215 | 216 | prep = False 217 | for w in range(0, len(sentence)): 218 | source = sentence[w] 219 | source_word = source[0] 220 | 221 | if source_word in rowwords: 222 | source_word += '-' + str(source[1].id) 223 | else: 224 | rowwords.add(source_word) 225 | 226 | table_estim[source_word] = OrderedDict() 227 | table_true[source_word] = OrderedDict() 228 | 229 | source_feat = ' '.join([source[1].pos] + sorted(source[1].feat)) 230 | 231 | # root 232 | target_word = '_root' 233 | if source[1].pos == 'PR': 234 | prep = True 235 | cur.execute('select sum(freq) from links where ffeat = ? and fword = ? and root', (source_feat, source_word)) 236 | else: 237 | cur.execute('select sum(freq) from links where ffeat = ? and root', (source_feat, )) 238 | 239 | table_estim[source_word][target_word] = cur.fetchone()[0] or 0 240 | table_true[source_word][target_word] = 'root' if source[1].dom == 0 else '' 241 | 242 | colwords = set() 243 | no = False 244 | for v in range(0, len(sentence)): 245 | target = sentence[v] 246 | target_word = target[0] 247 | if target_word in colwords: 248 | target_word += '-' + str(target[1].id) 249 | else: 250 | colwords.add(target_word) 251 | 252 | target_feat = ' '.join([target[1].pos] + sorted(target[1].feat)) 253 | if target[1].pos == 'CONJ' or (target[1].pos == 'PR' and source[1].pos in {'S', 'ADV', 'ADJ'}): 254 | cur.execute('select sum(freq) from links where ffeat = ? and tfeat = ? and tword = ?', (source_feat, target_feat, target_word)) 255 | elif source[1].pos == 'CONJ' or (source[1].pos == 'PR' and target[1].pos in {'S', 'ADV', 'ADJ'}): 256 | cur.execute('select sum(freq) from links where ffeat = ? and fword = ? and tfeat = ?', (source_feat, source_word, target_feat)) 257 | else: 258 | cur.execute('select sum(freq) from links where ffeat = ? 
and tfeat = ?', (source_feat, target_feat)) 259 | # table_estim[word_from][word_to] = '.' if v == w else round((cur.fetchone()[0] or 0) / (math.log(abs(v - w) + 2))) 260 | freq = cur.fetchone()[0] or 0 261 | 262 | if source[1].pos == 'S' and target[1].pos == 'PR' and w > v and prep: 263 | freq = 9999 264 | prep = False 265 | 266 | if source_word == 'не' and w < v and not no: 267 | freq = 9999 268 | no = True 269 | 270 | table_estim[source_word][target_word] = '.' if v == w else freq 271 | table_true[source_word][target_word] = '.' if v == w else source[1].link if source[1].dom == target[1].id else '' 272 | # table_estim[word_from][word_to] = '.' if v == w else 'x' if estim_y[i] else '' 273 | # table_true[word_from][word_to] = '.' if v == w else 'x' if test_y[i] else '' 274 | 275 | maxlen = print_table(table_true) 276 | print_table(table_estim, maxlen = maxlen) 277 | # ''' 278 | for rowkey, row in table_estim.items(): 279 | maxval = max([int(val) if val != '.' else 0 for val in list(row.values())[1:]]) 280 | for key, val in row.items(): 281 | if key == '_root': 282 | continue 283 | if val == '.': 284 | continue 285 | if val < maxval: 286 | table_estim[rowkey][key] = '' 287 | # ''' 288 | print_table(table_estim, maxlen = maxlen) 289 | 290 | def parse0(self, sentence): 291 | estim_y, test_y = self._linker.predict([sentence]) 292 | i = 0 293 | table_estim = OrderedDict() 294 | table_true = OrderedDict() 295 | rowwords = set() 296 | for w in range(0, len(sentence)): 297 | word_from = sentence[w][0] 298 | if word_from in rowwords: 299 | word_from += '-' + str(sentence[w][1].id) 300 | else: 301 | rowwords.add(word_from) 302 | 303 | table_estim[word_from] = OrderedDict() 304 | table_true[word_from] = OrderedDict() 305 | colwords = set() 306 | 307 | for v in range(0, len(sentence)): 308 | word_to = sentence[v][0] 309 | if word_to in colwords: 310 | word_to += '-' + str(sentence[v][1].id) 311 | else: 312 | colwords.add(word_to) 313 | # table_estim[word_from][word_to] = '.' 
if v == w else round(estim_y[i], 5) if estim_y[i] != 'none' else '' 314 | # table_true[word_from][word_to] = '.' if v == w else test_y[i] if test_y[i] != 'none' else '' 315 | table_estim[word_from][word_to] = '.' if v == w else estim_y[i] if estim_y[i] else '' 316 | table_true[word_from][word_to] = '.' if v == w else 'x' if test_y[i] else '' 317 | i += v != w 318 | 319 | maxlen = print_table(table_true) 320 | print_table(table_estim, maxlen = maxlen) 321 | 322 | genders = {'m', 'f', 'n'} 323 | cases = {'nom', 'gen', 'dat', 'acc', 'ins', 'prep', 'gen2', 'loc'} 324 | animacy = {'anim', 'inan'} 325 | number = {'sg', 'pl'} 326 | person = {'1p', '2p', '3p'} 327 | vtypes = {'perf', 'imperf'} 328 | vmood = {'real', 'imp', 'pass'} 329 | vform = {'inf', 'advj', 'advp'} 330 | tenses = {'pst', 'npst', 'prs'} 331 | degree = {'comp', 'supl'} 332 | 333 | class Links: 334 | def __init__(self, dbname): 335 | self.dbname = dbname 336 | db_exists = os.path.isfile(dbname) 337 | self.con = sqlite3.connect(dbname) 338 | self.cur = self.con.cursor() 339 | 340 | if not db_exists: 341 | self.create_db() 342 | 343 | def create_db(self): 344 | sql = ''' 345 | create table links( 346 | id integer primary key autoincrement, 347 | name text, 348 | 349 | fword text, 350 | ffeat text, 351 | fpos text, 352 | fnum text, 353 | fgen text, 354 | fcase text, 355 | fpers text, 356 | fanim text, 357 | ftype text, 358 | fmood text, 359 | ftens text, 360 | fdegr text, 361 | 362 | tword text, 363 | tfeat text, 364 | tpos text, 365 | tnum text, 366 | tgen text, 367 | tcase text, 368 | tpers text, 369 | tanim text, 370 | ttype text, 371 | tmood text, 372 | ttens text, 373 | tdegr text, 374 | 375 | root integer, 376 | freq integer, 377 | dist integer 378 | ); 379 | create index links_info on links(name, fword, tword, root, ffeat, tfeat, dist); 380 | create index links_info2 on links(ffeat, tfeat); 381 | create index links_info3 on links(ffeat, root); 382 | create index links_info4 on links(ffeat, fword, 
root); 383 | create index links_info5 on links(ffeat, fword, tfeat); 384 | create index links_info6 on links(ffeat, tfeat, tword); 385 | ''' 386 | 387 | sql0 = ''' 388 | create table links( 389 | id integer primary key autoincrement, 390 | name text, 391 | 392 | fword text, 393 | ffeat text, 394 | 395 | tword text, 396 | tfeat text, 397 | 398 | root integer, 399 | freq integer 400 | ); 401 | create index links_info on links(name, fword, tword, root, ffeat, tfeat); 402 | ''' 403 | [self.cur.execute(st) for st in sql.split(';') if len(st.strip())] 404 | 405 | def index(self, sentences): 406 | for sentence in sentences: 407 | for word_from in sentence: 408 | is_root = 0 409 | if word_from[1].dom: 410 | word_to = sentence[word_from[1].dom - 1] 411 | else: 412 | word_to = ('', syntagrus.word_t(lemma='', pos='', dom='', link='root', id=0, feat=set())) 413 | is_root = 1 414 | from_feat = ' '.join([word_from[1].pos] + sorted(word_from[1].feat)) 415 | to_feat = ' '.join([word_to[1].pos] + sorted(word_to[1].feat)) 416 | 417 | fpos = word_from[1].pos 418 | fnum = (number & word_from[1].feat or {None}).pop() 419 | fgen = (genders & word_from[1].feat or {None}).pop() 420 | fcase = (cases & word_from[1].feat or {None}).pop() 421 | fpers = (person & word_from[1].feat or {None}).pop() 422 | fanim = (animacy & word_from[1].feat or {None}).pop() 423 | ftype = (vtypes & word_from[1].feat or {None}).pop() 424 | fmood = (vmood & word_from[1].feat or {None}).pop() 425 | ftens = (tenses & word_from[1].feat or {None}).pop() 426 | fdegr = (degree & word_from[1].feat or {None}).pop() 427 | 428 | tpos = word_to[1].pos 429 | tnum = (number & word_to[1].feat or {None}).pop() 430 | tgen = (genders & word_to[1].feat or {None}).pop() 431 | tcase = (cases & word_to[1].feat or {None}).pop() 432 | tpers = (person & word_to[1].feat or {None}).pop() 433 | tanim = (animacy & word_to[1].feat or {None}).pop() 434 | ttype = (vtypes & word_to[1].feat or {None}).pop() 435 | tmood = (vmood & word_to[1].feat 
or {None}).pop() 436 | ttens = (tenses & word_to[1].feat or {None}).pop() 437 | tdegr = (degree & word_to[1].feat or {None}).pop() 438 | 439 | dist = word_to[1].id - word_from[1].id 440 | 441 | self.cur.execute('select id from links where name = ? and fword = ? and tword = ? and root = ? and ffeat = ? and tfeat = ? and dist = ?', (word_from[1].link, word_from[0].lower(), word_to[0].lower(), is_root, from_feat, to_feat, dist)) 442 | row = self.cur.fetchone() 443 | if row is None: 444 | sql = ''' 445 | insert into links (name, fword, tword, root, ffeat, tfeat, freq, dist, 446 | fpos, fnum, fgen, fcase, fpers, fanim, ftype, fmood, ftens, fdegr, 447 | tpos, tnum, tgen, tcase, tpers, tanim, ttype, tmood, ttens, tdegr 448 | ) values (?, ?, ?, ?, ?, ?, 1, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 449 | ''' 450 | self.cur.execute(sql, (word_from[1].link, word_from[0].lower(), word_to[0].lower(), is_root, from_feat, to_feat, dist, fpos, fnum, fgen, fcase, fpers, fanim, ftype, fmood, ftens, fdegr, tpos, tnum, tgen, tcase, tpers, tanim, ttype, tmood, ttens, tdegr)) 451 | else: 452 | self.cur.execute('update links set freq = freq + 1 where id = ?', row) 453 | 454 | def close(self): 455 | self.con.commit() 456 | self.con.close() 457 | 458 | if __name__ == '__main__': 459 | import glob 460 | from optparse import OptionParser 461 | import syntagrus 462 | 463 | parser = OptionParser() 464 | parser.usage = '%prog [options]' 465 | 466 | (options, args) = parser.parse_args() 467 | 468 | if not len(args): 469 | files = glob.glob('res/*/*/*.tgt') 470 | corpus = [] 471 | for file in files[0:10]: 472 | R = syntagrus.Reader() 473 | sentences = R.read(file) 474 | corpus.extend(sentences) 475 | del(R) 476 | 477 | fold_size = round(len(corpus) / 10) 478 | 479 | train_set = corpus[0:-fold_size] 480 | test_set = corpus[-fold_size:] 481 | 482 | print('{0} sentences'.format(len(corpus))) 483 | 484 | del(corpus) 485 | 486 | ''' 487 | 488 | L = Links('tmp/links') 489 | 
L.index(train_set) 490 | L.close() 491 | exit() 492 | ''' 493 | 494 | ''' 495 | 496 | L = Linker() 497 | L.train(train_set) 498 | # results = L.test(test_set) 499 | # print('Accuracy = {0[0]:.3f}, precision = {0[1]:.3f}, recall = {0[2]:.3f}, F1 = {0[3]:.3f}'.format(results)) 500 | ''' 501 | L = None 502 | 503 | P = Parser(L) 504 | example = test_set[6] #6 505 | for word in example: 506 | print(word) 507 | P.parse(example) 508 | -------------------------------------------------------------------------------- /src/liblinear.patch: -------------------------------------------------------------------------------- 1 | --- /home/alexpak/tools/liblinear-1.8/python/liblinearutil.py 2011-08-04 00:58:24.017288401 +0200 2 | +++ /home/alexpak/tools/tmp/liblinear-1.8/python/liblinearutil.py 2011-03-05 09:46:26.000000000 +0100 3 | @@ -30,7 +30,7 @@ 4 | 5 | Load a LIBLINEAR model from model_file_name and return. 6 | """ 7 | - model = liblinear.load_model(model_file_name.encode('utf-8')) 8 | + model = liblinear.load_model(model_file_name) 9 | if not model: 10 | print("can't open model file %s" % model_file_name) 11 | return None 12 | @@ -43,7 +43,7 @@ 13 | 14 | Save a LIBLINEAR model to the file model_file_name. 
15 | """ 16 | - liblinear.save_model(model_file_name.encode('utf-8'), model) 17 | + liblinear.save_model(model_file_name, model) 18 | 19 | def evaluations(ty, pv): 20 | """ 21 | @@ -204,7 +204,7 @@ 22 | y = [0] * len(x) 23 | ACC = evaluations(y, pred_labels) 24 | l = len(y) 25 | - #print("Accuracy = %g%% (%d/%d)" % (ACC, int(l*ACC//100), l)) 26 | + print("Accuracy = %g%% (%d/%d)" % (ACC, int(l*ACC//100), l)) 27 | 28 | return pred_labels, ACC, pred_values 29 | 30 | -------------------------------------------------------------------------------- /src/ml/__init__.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | class Classifier: 4 | def train(self, x, y): 5 | pass 6 | 7 | def predict(self, x): 8 | pass 9 | 10 | def evaluate(self, gold, test): 11 | tp = 0; fp = 0 12 | 13 | for i in range(0, len(gold)): 14 | if test[i] == gold[i]: 15 | tp += 1 16 | else: 17 | fp += 1 18 | 19 | acc = tp / (tp + fp) if tp + fp else 0 20 | 21 | return (acc, ) 22 | 23 | def evaluate_bin(self, gold, test, true_class): 24 | tp = 0; fp = 0; tn = 0; fn = 0 25 | 26 | for i in range(0, len(gold)): 27 | if gold[i] == true_class: 28 | if test[i] == gold[i]: 29 | tp += 1 30 | else: 31 | fn += 1 32 | else: 33 | if test[i] == gold[i]: 34 | tn += 1 35 | else: 36 | fp += 1 37 | 38 | acc = (tp + tn) / (tp + fp + tn + fn) if tp + fp + tn + fn else 0 39 | pr = tp / (tp + fp) if tp + fp else 0 40 | rec = tp / (tp + fn) if tp + fn else 0 41 | prn = tn / (tn + fn) if tn + fn else 0 42 | f1 = 2 * (pr * rec) / (pr + rec) if pr + rec else 0 43 | 44 | return (acc, pr, rec, f1, prn) 45 | 46 | 47 | def save(self, path): 48 | f = open(path, 'wb') 49 | pickle.dump(self, f) 50 | f.close() 51 | 52 | @staticmethod 53 | def load(path): 54 | f = open(path, 'rb') 55 | obj = pickle.load(f) 56 | f.close() 57 | return obj 58 | 59 | class Autoincrement: 60 | def __init__(self): 61 | self._ids = {} 62 | self._inv = {} 63 | 64 | def setId(self, val): 65 | if val not 
in self._ids: 66 | self._ids[val] = len(self._ids) + 1 67 | self._inv[self._ids[val]] = val 68 | 69 | return self._ids[val] 70 | 71 | def getId(self, val): 72 | return self._ids[val] if val in self._ids else 0 73 | 74 | def getVal(self, id): 75 | return self._inv[id] if id in self._inv else None 76 | 77 | def count(self): 78 | return len(self._ids) 79 | 80 | class FeatureSpace: 81 | def __init__(self): 82 | self.featureset = {} 83 | self.start = 0 84 | self.default_size = int(1e5) 85 | 86 | def add(self, featureset, size = None): 87 | if size is None: 88 | size = self.default_size 89 | 90 | for (feature, value) in featureset.items(): 91 | self.featureset[feature + self.start] = value 92 | 93 | self.start += size -------------------------------------------------------------------------------- /src/ml/nb.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Aug 17, 2011 3 | 4 | @author: alexpak 5 | ''' 6 | 7 | import math 8 | from collections import defaultdict 9 | from .. 
import ml 10 | 11 | class NaiveBayes(ml.Classifier): 12 | def __init__(self): 13 | pass 14 | 15 | def __repr__(self): 16 | return 'NaiveBayes' 17 | 18 | def _gaus(self, i, mean, var): 19 | return (1 / math.sqrt(2 * math.pi * var) * math.exp(- (i - mean) ** 2 / (2 * var))) if var > 0 else float(1 if i == mean else 0) 20 | 21 | 22 | def _prob(self, C, dim, val): 23 | p = 0 24 | if dim in self._P[C]: 25 | p = self._P[C][dim] 26 | elif dim in self._F[C]: 27 | p = self._gaus(val, self._F[C][dim][0], self._F[C][dim][1]) 28 | 29 | return p 30 | 31 | def train(self, x, y): 32 | data = defaultdict(list) 33 | labels = set() 34 | discrete_features = set() 35 | numeric_features = set() 36 | i = 0 37 | for C in y: 38 | labels.add(C) 39 | data[C].append(x[i]) 40 | for dim in x[i]: 41 | if isinstance(x[i][dim], float): 42 | numeric_features.add(dim) 43 | else: 44 | discrete_features.add(dim) 45 | i += 1 46 | 47 | ndim = len(discrete_features) 48 | 49 | # train discrete features 50 | P = {} 51 | for C in data: 52 | count = defaultdict(int) 53 | total = 0 54 | for sample in data[C]: 55 | for dim, val in sample.items(): 56 | if dim in discrete_features: 57 | count[dim] += val 58 | total += val 59 | 60 | P[C] = {} 61 | for dim in discrete_features: 62 | P[C][dim] = (1 + count[dim]) / (ndim + total) 63 | self._P = P 64 | 65 | # train numeric features 66 | F = {} 67 | for C in data: 68 | F[C] = {} 69 | n = 0 70 | for dim in numeric_features: 71 | n += 1 72 | mean = 0; var = 0; N = 0 73 | 74 | # calculate mean and length 75 | for sample in data[C]: 76 | mean += sample[dim] if dim in sample else 0 77 | N += 1 78 | mean /= N 79 | 80 | # calculate variance 81 | for sample in data[C]: 82 | var += (mean - (sample[dim] if dim in sample else 0)) ** 2 83 | var /= (N - 1) if N > 1 else N 84 | 85 | F[C][dim] = (mean, var) 86 | 87 | self._F = F 88 | 89 | def predict(self, x, return_likelihood = False): 90 | y = [] 91 | for sample in x: 92 | L = defaultdict(float) 93 | for C in self._P: 94 | for 
dim in sample: 95 | L[C] += math.log(self._prob(C, dim, sample[dim]) or 1) 96 | 97 | # y.append(max(L.keys(), key = lambda i: L[i]) if len(L) else next(iter(self._P))) 98 | if return_likelihood: 99 | y.append(L) 100 | else: 101 | y.append(max(L.keys(), key = lambda i: L[i]) if len(L) else None) 102 | 103 | return y 104 | 105 | -------------------------------------------------------------------------------- /src/ml/nn.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Aug 18, 2011 3 | 4 | @author: alexpak 5 | ''' 6 | 7 | import ml 8 | import math 9 | import random 10 | from collections import defaultdict 11 | 12 | random.seed() 13 | 14 | class Perceptron(ml.Classifier): 15 | def __init__(self, Nh): 16 | self.Nh = Nh 17 | 18 | self._labels = ml.Autoincrement() 19 | self._features = ml.Autoincrement() 20 | 21 | def _init(self, Ni, Nh, No): 22 | self.momentum = 0.9 23 | self.learn_rate = 0.5 24 | 25 | self._Wh = [[self._seed() for _ in range(0, Ni)] for __ in range(0, Nh)] 26 | self._Wo = [[self._seed() for _ in range(0, Nh)] for __ in range(0, No)] 27 | 28 | self._dWh = [[0] * Ni] * Nh 29 | self._dWo = [[0] * Nh] * No 30 | 31 | def get_class_id(self, C): 32 | if C not in self._class_ids: 33 | self._class_ids[C] = len(self._class_ids) 34 | 35 | def _seed(self): 36 | return (random.random() - 0.5) 37 | 38 | def _sigmod(self, x): 39 | return x 40 | return 1 / (1 + math.exp(-x)) 41 | 42 | def _calc_layer(self, input, W): 43 | output = [] 44 | for i in range(0, len(W)): 45 | s = 0 46 | for j in range(0, len(W[i])): 47 | s += W[i][j] * input[j] 48 | output.append(self._sigmod(s)) 49 | 50 | return output 51 | 52 | def _propagate(self, input): 53 | self._pi = input 54 | self._ph = self._calc_layer(self._pi, self._Wh) 55 | self._po = self._calc_layer(self._ph, self._Wo) 56 | return self._po 57 | 58 | def _backpropagate(self, output): 59 | # delta's for output layer 60 | do = [] 61 | for i in range(0, len(self._Wo)): 62 | 
print(output[i], self._po[i]) 63 | do.append(self._po[i] * (1 - self._po[i]) * (output[i] - self._po[i])) 64 | # print(do) 65 | 66 | # correct output layer weights 67 | for i in range(0, len(self._Wo)): 68 | for j in range(0, len(self._Wo[i])): 69 | self._dWo[i][j] = self.momentum * self._dWo[i][j] + (1 - self.momentum) * self.learn_rate * do[i] * self._ph[j] 70 | self._Wo[i][j] += self._dWo[i][j] 71 | 72 | # delta's for hidden layer 73 | dh = [] 74 | for i in range(0, len(self._Wh)): 75 | d = 0 76 | for j in range(0, len(self._Wo)): 77 | d += do[j] * self._Wo[j][i] 78 | d *= self._ph[i] * (1 - self._ph[i]) 79 | dh.append(d) 80 | # print(dh) 81 | 82 | # correct hidden layer weights 83 | for i in range(0, len(self._Wh)): 84 | for j in range(0, len(self._Wh[i])): 85 | self._dWh[i][j] = self.momentum * self._dWh[i][j] + (1 - self.momentum) * self.learn_rate * dh[i] * self._pi[j] 86 | self._Wh[i][j] += self._dWh[i][j] 87 | 88 | print(self._Wo) 89 | print(self._Wh) 90 | print() 91 | 92 | def train(self, x, y): 93 | labels = [self._labels.setId(C) for C in y] 94 | data = [] 95 | for sample in x: 96 | data.append(defaultdict(float, [(self._features.setId(d), sample[d]) for d in sample])) 97 | 98 | self._init(self._features.count(), self.Nh, self._labels.count()) 99 | 100 | epsilon = 1e-3 101 | for epoch in range(1, 10): 102 | i = 0 103 | error = 0 104 | for sample in data: 105 | output = self._propagate(sample) 106 | target = defaultdict(float) 107 | target[labels[i] - 1] = 1 108 | self._backpropagate(target) 109 | for j in range(0, len(output)): 110 | error += (output[j] - target[j]) ** 2 111 | 112 | i += 1 113 | 114 | print(error) 115 | print() 116 | if error < epsilon: 117 | break 118 | 119 | def predict(self, x): 120 | y = [] 121 | for sample in x: 122 | output = self._propagate(defaultdict(float, [(self._features.getId(d), sample[d]) for d in sample])) 123 | which_max = max(range(0, len(output)), key = lambda i: output[i]) 124 | y.append(self._labels.getVal(which_max 
+ 1)) 125 | 126 | return y -------------------------------------------------------------------------------- /src/ml/svm.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Aug 18, 2011 3 | 4 | @author: alexpak 5 | ''' 6 | import sys 7 | sys.path.append('/home/alexpak/tools/liblinear-1.8/python/') 8 | 9 | import liblinearutil as liblinear 10 | from .. import ml 11 | import pickle 12 | 13 | class SVM(ml.Classifier): 14 | def __init__(self): 15 | self._labels = ml.Autoincrement() 16 | self._features = ml.Autoincrement() 17 | self._regression = False 18 | 19 | def __repr__(self): 20 | return 'SVM' 21 | 22 | def save(self, path): 23 | liblinear.save_model(path + '-model', self._model) 24 | del(self._model) 25 | ml.Classifier.save(self, path) 26 | 27 | @staticmethod 28 | def load(path): 29 | obj = ml.Classifier.load(path) 30 | obj._model = liblinear.load_model(path + '-model') 31 | return obj 32 | 33 | def train(self, x, y, biased = False): 34 | data = [] 35 | for sample in x: 36 | data.append(dict([(self._features.setId(d), sample[d]) for d in sample])) 37 | 38 | labels = [self._labels.setId(C) for C in y] 39 | if self._labels.count() == 2: 40 | labels = [1 if label == 1 else -1 for label in labels] 41 | param = liblinear.parameter('-c 1 -s 2 -q' + (' -B {0}'.format(biased) if biased else '')) 42 | else: 43 | param = liblinear.parameter('-c 1 -s 4 -q' + (' -B {0}'.format(biased) if biased else '')) 44 | prob = liblinear.problem(labels, data) 45 | self._model = liblinear.train(prob, param) 46 | 47 | def train_regression(self, x, y): 48 | data = [] 49 | for sample in x: 50 | data.append(dict([(self._features.setId(d), sample[d]) for d in sample])) 51 | 52 | self._regression = True 53 | param = liblinear.parameter('-c 1 -s 0') 54 | prob = liblinear.problem(y, data) 55 | self._model = liblinear.train(prob, param) 56 | 57 | def predict(self, x): 58 | y = [] 59 | for sample in x: 60 | data = 
dict([(self._features.getId(d), sample[d]) for d in sample if self._features.getId(d)]) 61 | label, _, _ = liblinear.predict([0], [data], self._model, '') 62 | if self._regression: 63 | y.append(label[0]) 64 | else: 65 | if self._labels.count() == 2: 66 | label[0] = 1 if label[0] == 1 else 2 67 | y.append(self._labels.getVal(label[0])) 68 | 69 | return y -------------------------------------------------------------------------------- /src/morph.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | ''' 3 | Created on Nov 21, 2011 4 | 5 | @author: alexpak 6 | ''' 7 | 8 | import sys 9 | 10 | from yatk import ml 11 | from yatk.ml import svm 12 | from yatk.ml.svm import SVM as Classifier 13 | from collections import Counter 14 | 15 | sys.modules['ml'] = ml 16 | sys.modules['ml.svm'] = svm 17 | 18 | def intersects_classes(classes): 19 | return lambda w: (w[1].feat & classes).pop() 20 | 21 | def intersects_classes_or_none(classes, none): 22 | return lambda w: (w[1].feat & classes or {none}).pop() 23 | 24 | def has_classes(pos, classes): 25 | return lambda w: w[1].pos == pos and w[1].feat & classes 26 | 27 | def pos_equals(pos): 28 | return lambda w: w[1].pos == pos 29 | 30 | def has_class(a_class): 31 | return lambda w: int(a_class in w[1].feat) 32 | 33 | pos = {'S', 'A', 'V', 'ADV', 'NID', 'NUM', 'PR', 'PART', 'CONJ', 'COM', 'INTJ', 'P', 'UNK'} 34 | 35 | genders = {'m', 'f', 'n'} 36 | cases = {'nom', 'gen', 'dat', 'acc', 'ins', 'prep', 'gen2', 'loc'} 37 | animacy = {'anim', 'inan'} 38 | number = {'sg', 'pl'} 39 | person = {'1p', '2p', '3p'} 40 | vtypes = {'perf', 'imperf'} 41 | vmood = {'real', 'imp', 'pass'} 42 | vform = {'inf', 'advj', 'advp'} 43 | tenses = {'pst', 'npst', 'prs'} 44 | degree = {'comp', 'supl'} 45 | 46 | cats = [ 47 | ('pos', lambda w: True, lambda w: w[1].pos), 48 | 49 | ('s-gender', has_classes('S', genders), intersects_classes(genders)), 50 | ('s-case', has_classes('S', cases), 
intersects_classes(cases)), 51 | ('s-animacy', has_classes('S', animacy), intersects_classes(animacy)), 52 | ('s-number', has_classes('S', number), intersects_classes(number)), 53 | 54 | ('v-form', pos_equals('V'), intersects_classes_or_none(vform, 'pers')), 55 | ('v-person', has_classes('V', person), intersects_classes(person)), 56 | ('v-number', has_classes('V', number), intersects_classes(number)), 57 | ('v-gender', has_classes('V', genders), intersects_classes(genders)), 58 | ('v-type', has_classes('V', vtypes), intersects_classes(vtypes)), 59 | ('v-tense', has_classes('V', tenses), intersects_classes(tenses)), 60 | ('v-mood', has_classes('V', vmood), intersects_classes(vmood)), 61 | 62 | ('vadj-number', has_classes('VADJ', number), intersects_classes(number)), 63 | ('vadj-gender', has_classes('VADJ', genders), intersects_classes(genders)), 64 | ('vadj-type', has_classes('VADJ', vtypes), intersects_classes(vtypes)), 65 | ('vadj-tense', has_classes('VADJ', tenses), intersects_classes(tenses)), 66 | ('vadj-mood', has_classes('VADJ', vmood), intersects_classes(vmood)), 67 | ('vadj-case', has_classes('VADJ', cases), intersects_classes(cases)), 68 | 69 | ('a-gender', has_classes('A', genders), intersects_classes(genders)), 70 | ('a-case', has_classes('A', cases), intersects_classes(cases)), 71 | ('a-number', has_classes('A', number), intersects_classes(number)), 72 | ('a-degree', pos_equals('A'), intersects_classes_or_none(degree, 'ncomp')), 73 | ('a-short', pos_equals('A'), has_class('shrt')), 74 | ('a-animacy', has_classes('A', animacy), intersects_classes(animacy)), 75 | 76 | ('adv-comp', pos_equals('ADV'), intersects_classes_or_none(degree, 'ncomp')), 77 | 78 | ('num-gender', has_classes('NUM', genders), intersects_classes(genders)), 79 | ('num-case', has_classes('NUM', cases), intersects_classes(cases)), 80 | ('num-number', has_classes('NUM', number), intersects_classes(number)), 81 | ('num-degree', pos_equals('NUM'), intersects_classes_or_none(degree, 
'ncomp')), 82 | ] 83 | 84 | class Guesser: 85 | def __init__(self): 86 | self._cl = Classifier() 87 | 88 | def is_candidate(self, word): 89 | return True 90 | 91 | def make_class(self, word): 92 | pass 93 | 94 | def traverse(self, sentences): 95 | x = [] 96 | y = [] 97 | for sentence in sentences: 98 | for w in range(0, len(sentence)): 99 | word = sentence[w] 100 | 101 | if not self.is_candidate(word): 102 | continue 103 | 104 | x.append(self.gen_features(sentence, w)) 105 | y.append(self.make_class(word)) 106 | 107 | return (x, y) 108 | 109 | def train(self, sentences): 110 | (train_x, train_y) = self.traverse(sentences) 111 | self._cl.train(train_x, train_y) 112 | 113 | def predict(self, sentences): 114 | (test_x, test_y) = self.traverse(sentences) 115 | return (self._cl.predict(test_x), test_y) 116 | 117 | def test(self, sentences): 118 | (estim_y, test_y) = self.predict(sentences) 119 | return self._cl.evaluate(test_y, estim_y) 120 | 121 | def guess(self, word): 122 | return self._cl.predict([self.gen_features([(word,)], 0)])[0] 123 | 124 | def gen_features(self, sentence, w): 125 | word = sentence[w][0] 126 | x = {} 127 | 128 | x['p3:' + word[0:3]] = 1 129 | x['p4:' + word[0:4]] = 1 130 | x['p5:' + word[0:5]] = 1 131 | x['p6:' + word[0:6]] = 1 132 | # x['s1:' + word[-1:]] = 1 133 | x['s2:' + word[-2:]] = 1 134 | x['s3:' + word[-3:]] = 1 135 | x['s4:' + word[-4:]] = 1 136 | x['s5:' + word[-5:]] = 1 137 | x['w:' + word] = 1 138 | 139 | for i in range(1, 4): 140 | if w > i - 1: 141 | word = sentence[w - i][0] 142 | # x[str(i) + 'p3:' + prev[0:3]] = 1 143 | # x[str(i) + 'p4:' + prev[0:4]] = 1 144 | # x[str(i) + 'p5:' + prev[0:5]] = 1 145 | # x[str(i) + 'p6:' + prev[0:6]] = 1 146 | # x['s1:' + word[-1:]] = 1 147 | x[str(i) + 's2:' + word[-2:]] = 1 148 | x[str(i) + 's3:' + word[-3:]] = 1 149 | x[str(i) + 's4:' + word[-4:]] = 1 150 | # x[str(i) + 's5:' + prev[-5:]] = 1 151 | x[str(i) + 'w:' + word] = 1 152 | 153 | for i in range(1, 2): 154 | if w + i < len(sentence) 
- 1: 155 | word = sentence[w + i][0] 156 | # x[str(i) + 'p3:' + prev[0:3]] = 1 157 | # x[str(i) + 'p4:' + prev[0:4]] = 1 158 | # x[str(i) + 'p5:' + prev[0:5]] = 1 159 | # x[str(i) + 'p6:' + prev[0:6]] = 1 160 | # x['s1:' + word[-1:]] = 1 161 | x[str(i) + '+s2:' + word[-2:]] = 1 162 | x[str(i) + '+s3:' + word[-3:]] = 1 163 | x[str(i) + '+s4:' + word[-4:]] = 1 164 | # x[str(i) + 's5:' + prev[-5:]] = 1 165 | x[str(i) + '+w:' + word] = 1 166 | 167 | return x 168 | 169 | def save(self, path): 170 | self._cl.save(path) 171 | 172 | @staticmethod 173 | def load(path): 174 | obj = Guesser() 175 | obj._cl = Classifier.load(path) 176 | return obj 177 | 178 | class Tagger: 179 | def __init__(self): 180 | self._pos = Guesser.load('res/model/pos') 181 | self._guesser = {} 182 | for cat in cats: 183 | self._guesser[cat[0]] = Guesser.load('res/model/' + cat[0]) 184 | 185 | def label(self, sentence): 186 | tagged = self._pos.predict([sentence])[0] 187 | feats = {} 188 | for cat, guesser in self._guesser.items(): 189 | feats[cat] = guesser.predict([sentence])[0] 190 | 191 | labeled = [] 192 | for w in range(0, len(sentence)): 193 | pos = tagged[w] 194 | feat = [] 195 | cats = [] 196 | if pos == 'S': 197 | cats = ['s-number', 's-case', 's-animacy'] 198 | if True or feats['s-number'][w] == 'sg': 199 | feat.append(feats['s-gender'][w]) 200 | elif pos == 'A': 201 | cats = ['a-number', 'a-degree'] 202 | if feats['a-short'][w]: 203 | feat.append('shrt') 204 | else: 205 | feat.append(feats['a-case'][w]) 206 | if feats['a-number'][w] == 'sg': 207 | feat.append(feats['a-gender'][w]) 208 | elif pos == 'NUM': 209 | cats = ['num-gender', 'num-number', 'num-case', 'num-degree'] 210 | elif pos == 'V': 211 | cats = ['v-number', 'v-tense', 'v-mood', 'v-type'] 212 | if feats['v-tense'][w] == 'pst': 213 | if feats['v-number'][w] == 'sg': 214 | feat.append(feats['v-gender'][w]) 215 | else: 216 | feat.append(feats['v-person'][w]) 217 | 218 | elif pos == 'VINF': 219 | cats = ['v-type'] 220 | elif pos == 
'VADV': 221 | cats = ['v-type', 'v-tense'] 222 | elif pos == 'VADJ': 223 | cats = ['vadj-number', 'vadj-gender', 'vadj-tense', 'vadj-type', 'vadj-mood'] 224 | if feats['a-short'][w]: 225 | feat.append('shrt') 226 | else: 227 | feat.append(feats['vadj-case'][w]) 228 | feat.append(feats['a-degree'][w]) 229 | if feats['vadj-number'][w] == 'sg': 230 | feat.append(feats['vadj-gender'][w]) 231 | elif pos == 'ADV': 232 | cats = ['adv-comp'] 233 | 234 | for cat in cats: 235 | feat.append(feats[cat][w]) 236 | 237 | featset = set(feat) - {'ncomp'} 238 | 239 | labeled.append((sentence[w][0], pos, featset)) 240 | 241 | return labeled 242 | 243 | if __name__ == '__main__': 244 | import glob 245 | from optparse import OptionParser 246 | import syntagrus 247 | 248 | parser = OptionParser() 249 | parser.usage = '%prog [options]' 250 | 251 | (options, args) = parser.parse_args() 252 | 253 | if not len(args): 254 | files = glob.glob('res/*/*/*.tgt') 255 | corpus = [] 256 | for file in files: 257 | R = syntagrus.Reader() 258 | sentences = R.read(file) 259 | corpus.extend(sentences) 260 | del(R) 261 | 262 | print(len(corpus)) 263 | 264 | fold_size = round(len(corpus) / 2) 265 | 266 | train_set = corpus[0:-fold_size] 267 | test_set = corpus[-fold_size:] 268 | del(corpus) 269 | 270 | for cat in cats: 271 | G = Guesser() 272 | G.is_candidate = cat[1] 273 | G.make_class = cat[2] 274 | G.train(train_set) 275 | results = G.test(test_set) 276 | G.save('res/model/' + cat[0]) 277 | del(G) 278 | print('{0}\t\t{1:.3f}%'.format(cat[0], results[0] * 100)) 279 | 280 | else: 281 | T = Tagger() 282 | print('Loaded') 283 | words = args[0].split(' ') 284 | sentence = [] 285 | for word in words: 286 | sentence.append((word, tuple())) 287 | 288 | labeled = T.label(sentence) 289 | for word in labeled: 290 | print(word) 291 | -------------------------------------------------------------------------------- /src/mstparser.py: -------------------------------------------------------------------------------- 1 
| #!/usr/bin/env python3 2 | ''' 3 | Created on Nov 29, 2011 4 | 5 | @author: alexpak 6 | ''' 7 | 8 | if __name__ == '__main__': 9 | import glob 10 | from optparse import OptionParser 11 | import syntagrus 12 | import sys 13 | import morph 14 | 15 | parser = OptionParser() 16 | parser.usage = '%prog [options]' 17 | parser.add_option('-t', '--train', action='store_const', const=True, dest='train', help='generate train file') 18 | parser.add_option('-T', '--test', action='store_const', const=True, dest='test', help='generate test file') 19 | parser.add_option('-n', '--number', action='store', dest='number', type='int', help='number of files to process') 20 | parser.add_option('-f', '--format', action='store', dest='format', type='string', help='output format') 21 | parser.add_option('-M', '--nomorph', action='store_const', const=True, dest='nomorph', help='do not use morphology from annotations') 22 | 23 | (options, args) = parser.parse_args() 24 | 25 | if not options.train and not options.test: 26 | print('Specify --train or --test', file=sys.stderr) 27 | exit() 28 | 29 | if not options.number: 30 | print('Specify number of files -n', file=sys.stderr) 31 | exit() 32 | 33 | if not len(args): 34 | files = glob.glob('res/*/*/*.tgt') 35 | corpus = [] 36 | for file in files[0:options.number]: 37 | R = syntagrus.Reader() 38 | sentences = R.read(file) 39 | corpus.extend(sentences) 40 | del(R) 41 | 42 | fold_size = round(len(corpus) / 10) 43 | 44 | train_set = corpus[0:-fold_size] 45 | test_set = corpus[-fold_size:] 46 | 47 | print('{0} sentences'.format(len(corpus)), file=sys.stderr) 48 | 49 | del(corpus) 50 | 51 | a_set = test_set if options.test else train_set 52 | 53 | if options.nomorph: 54 | Tagger = morph.Tagger() 55 | for sentence in a_set: 56 | labeled = Tagger.label(sentence) 57 | for w in range(0, len(sentence)): 58 | sentence[w] = (sentence[w][0], sentence[w][1]._replace(pos=labeled[w][1], feat=labeled[w][2])) 59 | 60 | selected_feat = {'m', 'f', 'n', 'sg', 
'pl', '1p', '2p', '3p', 'nom', 'gen', 'gen2', 'dat', 'acc', 'ins', 'prep', 'loc', 'real', 'imp', 'pass', 'comp', 'shrt'} 61 | #selected_feat = {'nom', 'gen', 'gen2', 'dat', 'acc', 'ins', 'prep', 'loc', 'real', 'imp', 'pass', 'comp', 'shrt'} 62 | 63 | if options.format == 'malt': 64 | # Malt TAB format 65 | for sentence in a_set: 66 | for word in sentence: 67 | w = word[0] or 'FANTOM' 68 | p = '.'.join([word[1].pos] + sorted(word[1].feat & selected_feat)) 69 | l = word[1].link if word[1].dom else 'ROOT' 70 | d = str(word[1].dom) 71 | print('\t'.join([w, p, d, l])) 72 | print('') 73 | 74 | else: 75 | # MSTParser format 76 | for sentence in a_set: 77 | wn = [] 78 | pn = [] 79 | ln = [] 80 | dn = [] 81 | for word in sentence: 82 | wn.append(word[0] or 'FANTOM') 83 | pn.append('-'.join([word[1].pos] + sorted(word[1].feat & selected_feat))) 84 | ln.append(word[1].link if word[1].dom else 'ROOT') 85 | dn.append(str(word[1].dom)) 86 | 87 | print('\t'.join(wn)) 88 | print('\t'.join(pn)) 89 | print('\t'.join(ln)) 90 | print('\t'.join(dn)) 91 | print('') 92 | -------------------------------------------------------------------------------- /src/parsers/__init__.py: -------------------------------------------------------------------------------- 1 | WHITE_SPACE = {' ', '\t'} 2 | LINE_BREAK = {'\n', '\r'} 3 | 4 | class RuleError(Exception): 5 | def __init__(self, msg, line, pos): 6 | self.msg = msg 7 | self.line = line 8 | self.pos = pos 9 | 10 | def __str__(self): 11 | return repr('{0} at {1}:{2}'.format(self.msg, self.line, self.pos)) 12 | 13 | def read_rules(str): 14 | rules = [] 15 | rule = [] 16 | token = '' 17 | quoted = False 18 | line_num = 0 19 | char_num = 0 20 | for ch in str: 21 | if ch in WHITE_SPACE | LINE_BREAK: 22 | if len(token): 23 | rule.append(token) 24 | token = '' 25 | 26 | if ch in LINE_BREAK: 27 | if quoted: 28 | raise RuleError('Unexpected line break', line_num, char_num) 29 | 30 | line_num += 1 31 | char_num = 0 32 | if len(rule): 33 | 
rules.append(rule) 34 | rule = [] 35 | 36 | elif ch == '=' and not quoted: 37 | if len(token): 38 | rule.append(token) 39 | token = '' 40 | 41 | if len(rule) > 1: 42 | raise RuleError('Unexpected "="', line_num, char_num) 43 | 44 | elif ch == '"': 45 | if quoted: 46 | rule.append(token) 47 | token = '' 48 | 49 | quoted = not quoted 50 | 51 | elif ch == '|' and not quoted: 52 | if len(token): 53 | rule.append(token) 54 | token = '' 55 | 56 | if len(rule) < 1: 57 | raise RuleError('Unexpected "|"', line_num, char_num) 58 | 59 | rules.append(rule) 60 | rule = [rule[0]] 61 | 62 | else: 63 | token += ch 64 | 65 | char_num += 1 66 | 67 | return rules 68 | -------------------------------------------------------------------------------- /src/parsers/cyk.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Sep 6, 2011 3 | 4 | @author: alexpak 5 | ''' 6 | 7 | from collections import defaultdict 8 | 9 | class CYK: 10 | literals = set() 11 | nonterminal = set() 12 | rules = [] 13 | 14 | index = defaultdict(set) 15 | rindex = defaultdict(set) 16 | 17 | def __init__(self, grammar): 18 | candidates = set() 19 | for rule in grammar: 20 | self.rules.append(rule) 21 | self.nonterminal.add(rule[0]) 22 | 23 | if len(rule) == 2: 24 | candidates.add(rule[1]) 25 | 26 | self.literals = candidates - self.nonterminal 27 | 28 | for n in range(0, len(self.rules)): 29 | rule = self.rules[n] 30 | if len(rule) == 2: 31 | self.rindex[rule[1]].add(n) 32 | 33 | self.index[rule[0]].add(n) 34 | 35 | def tokenize(self, str): 36 | return str.split(' ') 37 | 38 | def parse(self, str): 39 | tokens = self.tokenize(str) 40 | 41 | P = {}; 42 | len_tokens = len(tokens) 43 | 44 | # returns positions of matching components or empty list 45 | def match(rule, start, length): 46 | if len(rule) == 1: 47 | result = [length] if rule[0] in P[start][length] else [] 48 | else: 49 | result = [] 50 | for l in range(1, length): 51 | if start + l > len_tokens: 52 | 
break 53 | 54 | if rule[0] not in P[start][l]: 55 | continue 56 | 57 | tail = match(rule[1:], start + l, length - l) 58 | if not len(tail): 59 | continue 60 | result = [l] + tail 61 | break 62 | 63 | return result 64 | 65 | # start at the leafs 66 | for p in range(0, len_tokens): 67 | tokenset = set(tokens[p]) 68 | P[p] = {1: defaultdict(set)} 69 | while len(tokenset): 70 | new_tokenset = set() 71 | for token in tokenset: 72 | for n in self.rindex[token]: 73 | rule = self.rules[n] 74 | P[p][1][rule[0]].add(n) 75 | new_tokenset.add(rule[0]) 76 | 77 | tokenset = new_tokenset 78 | 79 | for l in range(2, len_tokens + 1): # length 80 | for p in range(0, len_tokens): # position 81 | P[p][l] = defaultdict(set) 82 | for n in range(0, len(self.rules)): 83 | rule = self.rules[n] 84 | matching = match(rule[1:], p, l) 85 | if matching: 86 | P[p][l][rule[0]].add((n, tuple(matching))) 87 | 88 | self.P = P 89 | self.tokens = tokens 90 | 91 | return len(P[0][len_tokens]) 92 | 93 | def build_tree(self): 94 | def build(head, start, length): 95 | for matching in self.P[start][length][head]: 96 | if type(matching) == tuple: 97 | rule_n, lengths = matching 98 | rule = self.rules[rule_n] 99 | root = [head] 100 | start = start 101 | for i in range(0, len(rule) - 1): 102 | root.append(build(rule[i + 1], start, lengths[i])) 103 | start += lengths[i] 104 | else: 105 | rule = self.rules[matching] 106 | if rule[1] in self.literals: 107 | root = [head, rule[1]] 108 | else: 109 | root = [head, build(rule[1], start, length)] 110 | 111 | return tuple(root) 112 | 113 | return build('EX', 0, len(self.tokens)) 114 | 115 | def print_tree(self, tree, padding = '', pad_with = '\t'): 116 | if len(tree) == 2 and tree[1] in self.literals: 117 | print('{0}({1} "{2}")'.format(padding, tree[0], tree[1])) 118 | else: 119 | print('{0}({1}'.format(padding, tree[0])) 120 | for branch in tree[1:]: 121 | self.print_tree(branch, padding + pad_with, pad_with) 122 | print('{0})'.format(padding)) 
-------------------------------------------------------------------------------- /src/pos.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Aug 4, 2011 3 | 4 | @author: alexpak 5 | ''' 6 | import config 7 | import liblinearutil as svm 8 | 9 | tagset = ['S', 'A', 'NUM', 'A-NUM', 'V', 'ADV', 'PRAEDIC', 'PARENTH', 'S-PRO', 'A-PRO', 'ADV-PRO', 'PRAEDIC-PRO', 'PR', 'CONJ', 'PART', 'INTJ', 'INIT', 'NONLEX'] 10 | tag_id = {} 11 | tag_inv = {} 12 | for i in range(0, len(tagset)): 13 | tag_id[tagset[i]] = i + 1 14 | tag_inv[i + 1] = tagset[i] 15 | 16 | class Tagger: 17 | def __init__(self): 18 | self.chain_len = 3 19 | self._features = TaggerFeatures() 20 | pass 21 | 22 | def load(self, modelname, featuresname): 23 | self._svm_model = svm.load_model(modelname) 24 | self._features.load(open(featuresname, 'rb')) 25 | 26 | def save(self, modelname, featuresname): 27 | svm.save_model(modelname, self._svm_model) 28 | self._features.save(open(featuresname, 'wb')) 29 | 30 | def get_label_id(self, pos): 31 | return tag_id[pos] if pos in tag_id else 0 32 | 33 | def get_label(self, id): 34 | return tag_inv[id] if id in tag_inv else '?' 
35 | 36 | def train(self, sentences, labels, cross_validation = False): 37 | x = [] 38 | y = [] 39 | 40 | for i in range(0, len(sentences)): 41 | sentence = sentences[i] 42 | prev = [] 43 | 44 | j = 0 45 | for word in sentence: 46 | body = word.lower() 47 | 48 | featurespace = self._construct_featurespace(body, prev) 49 | 50 | prev.append((body, labels[i][j])) 51 | if len(prev) > self.chain_len: 52 | del(prev[0]) 53 | 54 | x.append(featurespace.featureset) 55 | j += 1 56 | 57 | y.extend(labels[i]) 58 | 59 | prob = svm.problem(y, x) 60 | 61 | if cross_validation: 62 | param = svm.parameter('-c 1 -v 4 -s 4') 63 | svm.train(prob, param) 64 | else: 65 | param = svm.parameter('-c 1 -s 4') 66 | self._svm_model = svm.train(prob, param) 67 | 68 | def label(self, sentence): 69 | labeled = [] 70 | prev = [] 71 | for word in sentence: 72 | body = word.lower() 73 | 74 | featurespace = self._construct_featurespace(body, prev) 75 | 76 | p_label, _, _ = svm.predict([0], [featurespace.featureset], self._svm_model, '') 77 | label = p_label[0] 78 | 79 | prev.append((body, label)) 80 | if len(prev) > self.chain_len: 81 | del(prev[0]) 82 | 83 | labeled.append((word, label)) 84 | 85 | return labeled 86 | 87 | def _construct_featurespace(self, word, prev): 88 | featurespace = ml.FeatureSpace() 89 | 90 | featurespace.add({1: len(word)}, 10) 91 | featurespace.add(self._features.from_suffix(word)) 92 | featurespace.add(self._features.from_prefix(word)) 93 | featurespace.add(self._features.from_body(word)) 94 | 95 | for item in prev: 96 | featurespace.add({1: item[1]}, 100) 97 | # featurespace.add(features.from_suffix(item[0])) 98 | # featurespace.add(features.from_prefix(item[0])) 99 | # featurespace.add(features.from_body(item[0])) 100 | 101 | return featurespace 102 | 103 | 104 | import pickle 105 | import ml 106 | class TaggerFeatures: 107 | def __init__(self): 108 | self._body_id = {} 109 | self._suffix_id = {} 110 | self._prefix_id = {} 111 | 112 | self._train = True 113 | 
self._featurespace = ml.FeatureSpace() 114 | 115 | def load(self, fp): 116 | (self._body_id, self._suffix_id, self._prefix_id) = pickle.load(fp) 117 | self._train = False 118 | 119 | def save(self, fp): 120 | pickle.dump((self._body_id, self._suffix_id, self._prefix_id), fp) 121 | 122 | def from_body(self, body): 123 | featureset = {} 124 | if self._train: 125 | if body not in self._body_id: 126 | self._body_id[body] = len(self._body_id) + 1 127 | 128 | featureset[self._body_id[body]] = 1 129 | else: 130 | if body in self._body_id: 131 | featureset[self._body_id[body]] = 1 132 | 133 | return featureset 134 | 135 | def from_suffix(self, body): 136 | featureset = {} 137 | 138 | suffix2 = body[-2:] 139 | if suffix2 not in self._suffix_id: 140 | self._suffix_id[suffix2] = len(self._suffix_id) + 1 141 | featureset[self._suffix_id[suffix2]] = 1 142 | 143 | suffix3 = body[-3:] 144 | if suffix3 not in self._suffix_id: 145 | self._suffix_id[suffix3] = len(self._suffix_id) + 1 146 | featureset[self._suffix_id[suffix3]] = 1 147 | 148 | return featureset 149 | 150 | def from_prefix(self, body): 151 | featureset = {} 152 | 153 | prefix2 = body[:2] 154 | if prefix2 not in self._prefix_id: 155 | self._prefix_id[prefix2] = len(self._prefix_id) + 1 156 | featureset[self._prefix_id[prefix2]] = 1 157 | 158 | prefix3 = body[:3] 159 | if prefix3 not in self._prefix_id: 160 | self._prefix_id[prefix3] = len(self._prefix_id) + 1 161 | featureset[self._prefix_id[prefix3]] = 1 162 | 163 | return featureset -------------------------------------------------------------------------------- /src/rnc.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Aug 5, 2011 3 | 4 | @author: alexpak 5 | ''' 6 | import xml.parsers.expat 7 | 8 | class Reader: 9 | def __init__(self): 10 | self._parser = xml.parsers.expat.ParserCreate() 11 | self._parser.StartElementHandler = self.start_element 12 | self._parser.EndElementHandler = self.end_element 13 | 
self._parser.CharacterDataHandler = self.char_data 14 | 15 | def start_element(self, name, attr): 16 | if name == 'ana': 17 | self._info = attr 18 | 19 | def end_element(self, name): 20 | if name == 'se': 21 | self._sentences.append(self._sentence) 22 | self._sentence = [] 23 | elif name == 'w': 24 | self._sentence.append((self._cdata, self._info)) 25 | elif name == 'ana': 26 | self._cdata = '' 27 | 28 | def char_data(self, content): 29 | self._cdata += content 30 | 31 | def read(self, filename): 32 | f = open(filename) 33 | content = f.read() 34 | f.close() 35 | 36 | self._sentences = [] 37 | self._sentence = [] 38 | self._cdata = '' 39 | self._info = '' 40 | 41 | self._parser.Parse(content) 42 | 43 | return self._sentences -------------------------------------------------------------------------------- /src/sentiment/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/irokez/Pyrus/9a5c64991592ca13e0a5726f394fbe2a399501c3/src/sentiment/__init__.py -------------------------------------------------------------------------------- /src/sentiment/demo.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Sentiment analysis demo - Pyrus 6 | 7 | 8 | 9 | 10 |
11 |
12 |
13 |

Pyrus

14 |
15 |
16 |
17 |

Sentiment analysis demo

18 | {if len(q) and not len(error)} 19 |
20 |

Movie: "{$q}"

21 | 22 | 23 | {for cl, text in output} 24 | 25 | 26 | 27 | 28 | {end} 29 | 30 |
{$cl}{$text}
31 |

Classified {$msgs} messages in {$time_total} sec, {$msgs_per_sec} msgs per sec

32 |
33 | {else} 34 |
35 |

Enter a movie name and press "Search"

36 | {if len(error)}

{$error}

{end} 37 |

For example: темный рыцарь

38 |
39 | {end} 40 |
41 |
42 |
43 |
44 | 49 |
50 |
51 |
52 |
53 |

© 2011–2012 Pyrus

54 |
55 |
56 | 57 | 58 | -------------------------------------------------------------------------------- /src/sentiment/demo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | ''' 4 | Created on Aug 7, 2011 5 | 6 | @author: alexpak 7 | ''' 8 | 9 | import cherrypy 10 | import sys 11 | import time 12 | import cgi 13 | import os 14 | import requests 15 | import json 16 | 17 | path = os.path.dirname(os.path.abspath(__file__)) + '/' 18 | 19 | f = open(path + 'demo.html', 'rb') 20 | content = f.read().decode() 21 | f.close() 22 | 23 | from pyrus.src import template 24 | from yatk import ir 25 | from yatk.ml.svm import SVM as Classifier 26 | from red import pie 27 | 28 | TTL = 60 29 | 30 | r = pie.Redis() 31 | cl = Classifier.load('test.svm') 32 | index = ir.SentimentIndex.load('test.index', 'delta', 'bogram') 33 | index.get_text = lambda x: x['text'] 34 | 35 | class HelloWorld: 36 | @cherrypy.expose 37 | def index(self, q = ''): 38 | start = time.time() 39 | q = q.strip() 40 | error = '' 41 | 42 | T = template.Template() 43 | 44 | if len(q): 45 | cached = r.get(q) 46 | 47 | if not cached: 48 | url = 'http://search.twitter.com/search.json' 49 | req = requests.get(url, params={'q': q}) 50 | data = json.loads(req.text) 51 | 52 | if 'results' not in data: 53 | print('Error') 54 | print(data) 55 | exit() 56 | 57 | cached = json.dumps(data['results']) 58 | r.setex(q, TTL, cached) 59 | 60 | results = json.loads(cached) 61 | 62 | docs = [] 63 | 64 | for msg in results: 65 | feats = index.weight(index.features(msg)) 66 | docs.append(feats) 67 | 68 | labels = cl.predict(docs) 69 | output = [] 70 | for n in range(len(results)): 71 | output.append((labels[n], results[n]['text'])) 72 | 73 | end = time.time() 74 | 75 | T.q = cgi.escape(q) 76 | T.output = output 77 | T.time_total = round(end - start, 1) 78 | T.msgs = len(output) 79 | T.msgs_per_sec = round(len(output) / (end - start), 1) 80 | 81 | 
T.error = error 82 | 83 | 84 | return T.transform(content) 85 | 86 | @cherrypy.expose 87 | def test(self): 88 | return content 89 | 90 | cherrypy.server.socket_host = '0.0.0.0' 91 | config = { 92 | '/': { 93 | 'tools.staticdir.on': True, 94 | 'tools.staticdir.dir': path + 'public/', 95 | 'tools.encode.encoding': 'utf8' 96 | } 97 | } 98 | 99 | cherrypy.quickstart(HelloWorld(), config = config) 100 | -------------------------------------------------------------------------------- /src/sentiment/download-kinopoisk.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import requests 4 | import time 5 | import sqlite3 6 | from bs4 import BeautifulSoup as soup 7 | 8 | def download(cl, limit): 9 | url = 'http://www.kinopoisk.ru/review/type/comment/status/{0}/period/year/perpage/100/page/{1}/' 10 | 11 | texts = [] 12 | p = 1 13 | 14 | while True: 15 | r = requests.get(url.format(cl, p)) 16 | s = soup(r.text) 17 | for div in s.find_all('div', {'class': 'userReview'}): 18 | div_resp = div.find('div', {'class': 'response'}) 19 | div_text = div.find('div', {'class': 'brand_words'}) 20 | 21 | texts.append((div_text.text,)) 22 | 23 | print('Processed page {0}, {1} texts'.format(p, len(texts))) 24 | if len(texts) >= limit: 25 | break 26 | 27 | p += 1 28 | time.sleep(1) 29 | 30 | return texts[:limit] 31 | 32 | con = sqlite3.connect('test.db') 33 | cur = con.cursor() 34 | 35 | cur.execute(''' 36 | create table docs( 37 | id integer primary key autoincrement, 38 | text text, 39 | class text 40 | ) 41 | ''') 42 | 43 | limit = 500 44 | 45 | texts_pos = download('good', limit) 46 | texts_neg = download('bad', limit) 47 | 48 | cur.executemany('insert into docs (class, text) values ("pos", ?)', texts_pos) 49 | cur.executemany('insert into docs (class, text) values ("neg", ?)', texts_neg) 50 | 51 | con.commit() 52 | con.close() -------------------------------------------------------------------------------- 
#!/usr/bin/env python3
'''Build the per-class ngram document-frequency table for the sentiment corpus.'''

import sqlite3
import re
from yatk import ir
from collections import defaultdict

con = sqlite3.connect('test.db')
con.row_factory = sqlite3.Row
cur = con.cursor()

# One row per ngram: how many positive / negative documents contain it.
cur.execute('''
	create table if not exists ngrams (
		id integer primary key autoincrement,
		body text,
		n_pos integer,
		n_neg integer
	)
''')

count = defaultdict(lambda: {'pos': 0, 'neg': 0})

# Document frequency, not term frequency: set() collapses duplicates
# within a document, so each doc contributes at most 1 per ngram.
cur.execute('select class, text from docs')
for row in cur.fetchall():
	words = ir.tokenize(row['text'].lower())
	ngrams = set(words + ir.ngrams(words, 2))
	for ngram in ngrams:
		count[ngram]['pos'] += row['class'] == 'pos'
		count[ngram]['neg'] += row['class'] == 'neg'

# Bulk insert instead of one execute() per ngram.
cur.executemany(
	'insert into ngrams (body, n_pos, n_neg) values (?, ?, ?)',
	((body, c['pos'], c['neg']) for body, c in count.items())
)

# "if not exists" keeps the script re-runnable, matching the table DDL
# above (the original crashed on a second run here).
cur.execute('create index if not exists ngrams_body on ngrams(body)')

con.commit()
con.close()
{ 45 | background: LightSalmon; 46 | } 47 | 48 | #result th { 49 | padding: 0px 20px 0 0; 50 | text-align: left; 51 | font-size: 14px; 52 | font-weight: normal; 53 | } 54 | #result td { 55 | padding: 0px 0px 0 0; 56 | font-style: italic; 57 | font-size: 14px; 58 | color: #333; 59 | } 60 | 61 | #description { 62 | padding: 20px; 63 | background: #eee; 64 | margin-top: 16px; 65 | line-height: 24px; 66 | } 67 | 68 | form { 69 | padding: 20px 0; 70 | position: relative; 71 | } 72 | 73 | div#text { 74 | padding-right: 240px; 75 | margin-bottom: 16px; 76 | } 77 | 78 | input { 79 | font-family: serif; 80 | font-size: 24px; 81 | padding: 10px; 82 | width: 100%; 83 | border: 1px inset #999; 84 | } 85 | 86 | div#submit { 87 | position: absolute; 88 | top: 19px; 89 | right: 0px; 90 | text-align: center; 91 | } 92 | 93 | button { 94 | font-size: 24px; 95 | padding: 10px; 96 | text-align: center; 97 | width: 200px; 98 | } 99 | 100 | footer { 101 | padding: 20px 0; 102 | } 103 | -------------------------------------------------------------------------------- /src/sentiment/public/reset5.css: -------------------------------------------------------------------------------- 1 | /* 2 | html5doctor.com Reset Stylesheet 3 | v1.6.1 4 | Last Updated: 2010-09-17 5 | Author: Richard Clark - http://richclarkdesign.com 6 | Twitter: @rich_clark 7 | */ 8 | 9 | html, body, div, span, object, iframe, 10 | h1, h2, h3, h4, h5, h6, p, blockquote, pre, 11 | abbr, address, cite, code, 12 | del, dfn, em, img, ins, kbd, q, samp, 13 | small, strong, sub, sup, var, 14 | b, i, 15 | dl, dt, dd, ol, ul, li, 16 | fieldset, form, label, legend, 17 | table, caption, tbody, tfoot, thead, tr, th, td, 18 | article, aside, canvas, details, figcaption, figure, 19 | footer, header, hgroup, menu, nav, section, summary, 20 | time, mark, audio, video { 21 | margin:0; 22 | padding:0; 23 | border:0; 24 | outline:0; 25 | font-size:100%; 26 | vertical-align:baseline; 27 | background:transparent; 28 | } 29 | 30 | body { 31 
| line-height:1; 32 | } 33 | 34 | article,aside,details,figcaption,figure, 35 | footer,header,hgroup,menu,nav,section { 36 | display:block; 37 | } 38 | 39 | nav ul { 40 | list-style:none; 41 | } 42 | 43 | blockquote, q { 44 | quotes:none; 45 | } 46 | 47 | blockquote:before, blockquote:after, 48 | q:before, q:after { 49 | content:''; 50 | content:none; 51 | } 52 | 53 | a { 54 | margin:0; 55 | padding:0; 56 | font-size:100%; 57 | vertical-align:baseline; 58 | background:transparent; 59 | } 60 | 61 | /* change colours to suit your needs */ 62 | ins { 63 | background-color:#ff9; 64 | color:#000; 65 | text-decoration:none; 66 | } 67 | 68 | /* change colours to suit your needs */ 69 | mark { 70 | background-color:#ff9; 71 | color:#000; 72 | font-style:italic; 73 | font-weight:bold; 74 | } 75 | 76 | del { 77 | text-decoration: line-through; 78 | } 79 | 80 | abbr[title], dfn[title] { 81 | border-bottom:1px dotted; 82 | cursor:help; 83 | } 84 | 85 | table { 86 | border-collapse:collapse; 87 | border-spacing:0; 88 | } 89 | 90 | /* change border colour to suit your needs */ 91 | hr { 92 | display:block; 93 | height:1px; 94 | border:0; 95 | border-top:1px solid #cccccc; 96 | margin:1em 0; 97 | padding:0; 98 | } 99 | 100 | input, select { 101 | vertical-align:middle; 102 | } -------------------------------------------------------------------------------- /src/sentiment/test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | import requests 5 | import json 6 | from yatk import ir 7 | from yatk.ml.svm import SVM as Classifier 8 | # from yatk.ml.nb import NaiveBayes as Classifier 9 | from red import pie 10 | 11 | if len(sys.argv) < 2: 12 | print('Enter query') 13 | exit() 14 | 15 | TTL = 60 16 | 17 | r = pie.Redis() 18 | q = sys.argv[1] 19 | cached = r.get(q) 20 | 21 | if not cached: 22 | url = 'http://search.twitter.com/search.json' 23 | req = requests.get(url, params={'q': sys.argv[1]}) 24 | data 
#!/usr/bin/env python3
'''Train the sentiment classifier on the full review corpus and save it.'''

import sqlite3
from yatk import ir
from yatk.ml.svm import SVM as Classifier
# from yatk.ml.nb import NaiveBayes as Classifier

# Load every (class, text) pair from the corpus database.
con = sqlite3.connect('test.db')
con.row_factory = sqlite3.Row
cur = con.cursor()

cur.execute('select class, text from docs')
docs = [(row['class'], row['text']) for row in cur.fetchall()]

# Build the sentiment index ('delta' weighting, 'bogram' features)
# over the whole corpus; docs are (class, text) tuples.
index = ir.SentimentIndex('delta', 'bogram')
index.get_class = lambda x: x[0]
index.get_text = lambda x: x[1]
index.build(docs)

# Vectorize each document and keep its class label.
x = [index.weight(index.features(doc)) for doc in docs]
y = [doc[0] for doc in docs]

cl = Classifier()
cl.train(x, y)
cl.save('test.svm')

index.save('test.index')

con.close()
def test(classifier, features, weight):
	'''Run 5-fold cross-validation of *classifier* on docs_even using the
	given feature set and weighting scheme; print micro-averaged accuracy.'''
	scores = []
	for fold in range(1, 6):
		train_docs, test_docs = ml.folds(docs_even, 5, fold)

		# Index is built on the training fold only.
		index = ir.SentimentIndex(weight, features)
		index.get_class = lambda x: x[0]
		index.get_text = lambda x: x[1]
		index.build(train_docs)

		def vectorize(docs):
			# Map each (class, text) doc to its weighted feature vector.
			return [index.weight(index.features(doc)) for doc in docs]

		train_x = vectorize(train_docs)
		train_y = [doc[0] for doc in train_docs]
		test_x = vectorize(test_docs)
		test_y = [doc[0] for doc in test_docs]

		cl = classifier()
		cl.train(train_x, train_y)
		labels = cl.predict(test_x)
		mic, mac = cl.evaluate(test_y, labels)
		scores.append(mic)

	print('{0} {1} {2}: {3:.1f}%'.format(classifier, features, weight, ir.avg(scores) * 100))
OptionParser 12 | from collections import namedtuple 13 | 14 | word_t = namedtuple('word_t', ['lemma', 'pos', 'feat', 'id', 'dom', 'link']) 15 | feat_ru_en = { 16 | 'ЕД': 'sg', 17 | 'МН': 'pl', 18 | 'ЖЕН': 'f', 19 | 'МУЖ': 'm', 20 | 'СРЕД': 'n', 21 | 'ИМ': 'nom', 22 | 'РОД': 'gen', 23 | 'ДАТ': 'dat', 24 | 'ВИН': 'acc', 25 | 'ТВОР': 'ins', 26 | 'ПР': 'prep', 27 | 'ПАРТ': 'gen2', 28 | 'МЕСТН': 'loc', 29 | 'ОД': 'anim', 30 | 'НЕОД': 'inan', 31 | 'ИНФ': 'inf', 32 | 'ПРИЧ': 'adjp', 33 | 'ДЕЕПР': 'advp', 34 | 'ПРОШ': 'pst', 35 | 'НЕПРОШ': 'npst', 36 | 'НАСТ': 'prs', 37 | '1-Л': '1p', 38 | '2-Л': '2p', 39 | '3-Л': '3p', 40 | 'ИЗЪЯВ': 'real', 41 | 'ПОВ': 'imp', 42 | 'КР': 'shrt', 43 | 'НЕСОВ': 'imperf', 44 | 'СОВ': 'perf', 45 | 'СТРАД': 'pass', 46 | 'СЛ': 'compl', 47 | 'СМЯГ': 'soft', 48 | 'СРАВ': 'comp', 49 | 'ПРЕВ': 'supl', 50 | } 51 | 52 | link_ru_en = { 53 | 'предик': 'subj', 54 | '1-компл': 'obj', 55 | '2-компл': 'obj', 56 | '3-компл': 'obj', 57 | '4-компл': 'obj', 58 | '5-компл': 'obj', 59 | 'опред': 'amod', 60 | 'предл': 'prep', 61 | 'обст': 'pobj', 62 | } 63 | { 64 | 'огранич': '', 65 | 'квазиагент': '', 66 | 'сочин': '', 67 | 'соч-союзн': '', 68 | 'атриб': '', 69 | 'аппоз': '', 70 | 'подч-союзн': '', 71 | 'вводн': '', 72 | 'сент-соч': '', 73 | 'количест': '', 74 | 'разъяснит': '', 75 | 'присвяз': '', 76 | 'релят': '', 77 | 'сравн-союзн': '', 78 | 'примыкат': '', 79 | 'сравнит': '', 80 | 'соотнос': '', 81 | 'эксплет': '', 82 | 'аналит': '', 83 | 'пасс-анал': '', 84 | 'вспом': '', 85 | 'агент': '', 86 | 'кратн': '', 87 | 'инф-союзн': '', 88 | 'электив': '', 89 | 'композ': '', 90 | 'колич-огран': '', 91 | 'неакт-компл': '', 92 | 'пролепт': '', 93 | 'суб-копр': '', 94 | 'дат-субъект': '', 95 | 'длительн': '', 96 | 'об-аппоз': '', 97 | 'изъясн': '', 98 | 'компл-аппоз': '', 99 | 'оп-опред': '', 100 | '1-несобст-компл': '', 101 | 'распред': '', 102 | 'уточн': '', 103 | 'нум-аппоз': '', 104 | 'ном-аппоз': '', 105 | '2-несобст-компл': '', 106 | 'аппрокс-колич': '', 107 | 
class Reader:
	'''SAX-style reader for SynTagRus *.tgt dependency-treebank files.

	Produces sentences as lists of (wordform, word_t) tuples, where
	word_t carries lemma, POS, morphological features, token id, head id
	and dependency link.
	'''

	def __init__(self):
		self._parser = xml.parsers.expat.ParserCreate()
		self._parser.StartElementHandler = self.start_element
		self._parser.EndElementHandler = self.end_element
		self._parser.CharacterDataHandler = self.char_data

	def start_element(self, name, attr):
		if name == 'W':
			# Translate Russian feature tags to their English equivalents;
			# unknown tags pass through unchanged.
			features = attr['FEAT'].split(' ') if 'FEAT' in attr else ['UNK']
			for i in range(0, len(features)):
				if features[i] in feat_ru_en:
					features[i] = feat_ru_en[features[i]]

			# Fixed: original read "lemma = lemma=attr[...]" — a doubled
			# assignment-target typo.
			lemma = attr['LEMMA'].lower() if 'LEMMA' in attr else ''
			link = attr['LINK'] if 'LINK' in attr else None
#			if link in link_ru_en:
#				link = link_ru_en[link]

			# '_root' marks the sentence root; encode it as head id 0.
			dom = int(attr['DOM']) if attr['DOM'] != '_root' else 0
			pos = features[0]
			feat = set(features[1:])

			# Participles, adverbial participles and infinitives are
			# promoted from features to dedicated POS tags.
			if 'adjp' in feat:
				pos = 'VADJ'
				feat -= {'adjp'}

			if 'advp' in feat:
				pos = 'VADV'
				feat -= {'advp'}

			if 'inf' in feat:
				pos = 'VINF'
				feat -= {'inf'}

			self._info = word_t(lemma=lemma, pos=pos, feat=feat, id=int(attr['ID']), dom=dom, link=link)
			self._cdata = ''

	def end_element(self, name):
		if name == 'S':
			self._sentences.append(self._sentence)
			self._sentence = []
		elif name == 'W':
			self._sentence.append((self._cdata, self._info))
			self._cdata = ''

	def char_data(self, content):
		self._cdata += content

	def read(self, filename):
		'''Parse *filename* (windows-1251 encoded) and return the list of
		sentences, each a list of (wordform, word_t) tuples.'''
		with open(filename, encoding='windows-1251') as f:
			content = f.read()
		# The file is decoded to str already; patch the XML declaration so
		# expat does not try to re-interpret it as windows-1251.
		content = content.replace('encoding="windows-1251"', 'encoding="utf-8"')

		self._sentences = []
		self._sentence = []
		self._cdata = ''
		self._info = ''

		self._parser.Parse(content)

		return self._sentences
#!/usr/bin/env python3
'''
Created on Aug 7, 2011

@author: alexpak
'''
def print_stack(stack, padding = '\n', pad_with = '\t'):
	'''Recursively compile a parsed template *stack* into Python source.

	Strings in the stack become literal appends to __s__; tuples are
	('if', cond, then, else), ('for', header, body) or ('print', expr)
	nodes. *padding* carries the current newline-plus-indent prefix.
	'''
	s = ''
	for expr in stack:
		if isinstance(expr, str):
			s += padding + '__s__ +="""' + expr.replace('"', '\\"') + '"""'
		else:
			if expr[0] == 'if':
				s += padding + 'if ' + expr[1] + ':'
				# An empty branch still needs a body: fall back to 'pass'.
				s += print_stack(expr[2], padding + pad_with, pad_with) or padding + pad_with + 'pass'
				s += padding + 'else:'
				s += print_stack(expr[3], padding + pad_with, pad_with) or padding + pad_with + 'pass'
			elif expr[0] == 'for':
				s += padding + 'for ' + expr[1] + ':'
				s += print_stack(expr[2], padding + pad_with, pad_with) or padding + pad_with + 'pass'
			elif expr[0] == 'print':
				s += padding + '__s__+=str(' + expr[1] + ')'
	return s

class Template:
	'''Minimal text-template engine.

	Syntax: {$expr} prints an expression; {if cond}...{else}...{end} and
	{for target in seq}...{end} control flow. Variables are assigned via
	attribute access, item access or assign().
	'''

	def __init__(self):
		# Fixed: vars used to be a class-level dict shared by every
		# Template instance. It must be set with object.__setattr__
		# because __setattr__ below routes everything into vars itself.
		object.__setattr__(self, 'vars', {})

	def assign(self, key, val):
		self.vars[key] = val

	def __setattr__(self, key, val):
		self.assign(key, val)

	def __setitem__(self, key, val):
		self.assign(key, val)

	def transform(self, template):
		'''Parse *template*, compile it to Python source and execute it
		against self.vars; return the rendered string.'''
		buffer = ''

		stack = []                      # top-level parse tree
		current_stack = stack           # list receiving literals/nodes now
		stack_chain = []                # parent stacks to pop back to on {end}
		stack_chain.append(current_stack)
		expr = tuple()
		last_if = tuple()               # most recent if-node, target of {else}
		open_bracket = False
		for ch in template:
			if ch == '{':
				if open_bracket:
					# '{{' — keep the first brace as literal text.
					current_stack.append('{')

				open_bracket = True
				current_stack.append(buffer)
				buffer = ''
			elif ch == '}' and len(buffer):
				if buffer[0:3] == 'if ':
					# ('if', condition, then-branch, else-branch)
					expr = ('if', buffer[3:], [], [])
					current_stack.append(expr)
					stack_chain.append(current_stack)
					current_stack = expr[2]
					last_if = expr

				elif buffer == 'else':
					if last_if[0] != 'if':
						exit('Expected IF for ELSE')

					current_stack = last_if[3]

				elif buffer[0:4] == 'for ':
					# ('for', loop header, body)
					expr = ('for', buffer[4:], [])
					current_stack.append(expr)
					stack_chain.append(current_stack)
					current_stack = expr[2]

				elif buffer[0] == '$':
					# {$expression} — print its value.
					expr = ('print', buffer[1:])
					current_stack.append(expr)

				elif buffer == 'end':
					current_stack = stack_chain.pop()

				else:
					# Not a directive: re-emit the braces as literal text.
					if open_bracket:
						current_stack.append('{')

					current_stack.append(buffer + '}')

				open_bracket = False
				buffer = ''
			else:
				buffer += ch

		if buffer:
			# Flush trailing literal text.
			if open_bracket:
				current_stack.append('{')
			current_stack.append(buffer)

		source = '__s__ = ""' + print_stack(stack)

		exec(source, self.vars)
		return self.vars['__s__']
#!/usr/bin/env python3
'''
Created on Aug 3, 2011

@author: alexpak
'''
import sys
import re

import rnc
import pos

# Collect training sentences from the RNC media subcorpora.
sentences = []
#sentences.extend(rnc.Reader().read('tmp/fiction.xml'))
#sentences.extend(rnc.Reader().read('tmp/science.xml'))
#sentences.extend(rnc.Reader().read('tmp/laws.xml'))
sentences.extend(rnc.Reader().read('tmp/media1.xml'))
sentences.extend(rnc.Reader().read('tmp/media2.xml'))
sentences.extend(rnc.Reader().read('tmp/media3.xml'))

# Extract the leading POS tag from the 'gr' annotation string.
# NOTE(review): the original called .format('|'.join(pos.tagset)) on a
# pattern with no placeholders — presumably the tagset was meant to be
# interpolated. The permissive pattern is kept to preserve behavior.
re_pos = re.compile(r'([\w-]+)(?:[^\w-]|$)')

tagger = pos.Tagger()

sentence_labels = []
sentence_words = []
for sentence in sentences:
	labels = []
	words = []
	for word in sentence:
		gr = word[1]['gr']
		m = re_pos.match(gr)
		if not m:
			# Unparsable annotation: report and skip this word (the
			# original crashed on m.group(1) right after this print).
			print(gr, file = sys.stderr)
			continue

		# Use a local name that does not shadow the imported `pos`
		# module (original bug).
		tag = m.group(1)
		if tag == 'ANUM':
			# Normalize the RNC tag to the tagger's label set.
			tag = 'A-NUM'

		label = tagger.get_label_id(tag)
		if not label:
			print(gr, file = sys.stderr)

		labels.append(label)

		# Strip stress marks from the wordform.
		body = word[0].replace('`', '')
		words.append(body)

	sentence_labels.append(labels)
	sentence_words.append(words)

tagger.train(sentence_words, sentence_labels, True)
tagger.train(sentence_words, sentence_labels)
tagger.save('tmp/svm.model', 'tmp/ids.pickle')
#!/usr/bin/env python3
'''
Created on Aug 19, 2011

@author: alexpak
'''

import sys
sys.path.append('src')

from alg import Vector as V

# Exercise Vector arithmetic: scalar add, vector subtract, in-place add.
a = V([1, 2, 3, 4])
b = a + 3
c = b + 4
d = c - a
f = V([1, 1, 1, 1])
f += a

for vec in (a, b, c, d, f):
	print(vec)
#!/usr/bin/env python3
'''
Created on Aug 17, 2011

@author: alexpak
'''

import sys
sys.path.append('src')

# Load the iris dataset: four float features plus a class label per row;
# the length check skips blank/trailing lines.
x = []
y = []
with open('res/iris.data') as f:
	for line in f:
		rows = line.strip().split(',')
		if len(rows) == 5:
			x.append([float(i) for i in rows[0:4]])
			y.append(rows[4])

def list_to_dict(l):
	'''Convert a feature list to a sparse {index: value} mapping.'''
	return dict(zip(range(0, len(l)), l))

# Split into training (2 of every 3 rows) and test (every 3rd row) sets.
test_x = []; test_y = []; train_x = []; train_y = []
for i in range(0, len(x)):
	if i % 3:
		train_x.append(list_to_dict(x[i]))
		train_y.append(y[i])
	else:
		test_x.append(list_to_dict(x[i]))
		test_y.append(y[i])

from ml.nb import NaiveBayes

classifier = NaiveBayes()
classifier.train(train_x, train_y)
estim_y = classifier.predict(test_x)
(acc, ) = classifier.evaluate(test_y, estim_y)

print('Naive Bayes accuracy = {0:.2f}%'.format(acc * 100))

from ml.svm import SVM

classifier = SVM()
classifier.train(train_x, train_y)
estim_y = classifier.predict(test_x)
(acc, ) = classifier.evaluate(test_y, estim_y)

print('SVM accuracy = {0:.2f}%'.format(acc * 100))
#!/usr/bin/env python3
'''
Created on Aug 18, 2011

@author: alexpak
'''
import sys
sys.path.append('src')

def list_to_dict(l):
	'''Turn a list into a sparse {index: value} mapping.'''
	return dict(zip(range(0, len(l)), l))

from ml.nn import Perceptron

# XOR truth table encoded as sparse feature dicts.
classifier = Perceptron(2)
x = [list_to_dict(pair) for pair in ([0, 0], [0, 1], [1, 0], [1, 1])]
y = [0, 1, 1, 0]
#y = [1]

classifier.train(x, y)
y = classifier.predict(x)
print(y)
#!/usr/bin/env python3
'''
Created on Aug 17, 2011

@author: alexpak
'''
import sys
sys.path.append('src')

from collections import defaultdict

def count_words(line):
	'''Count occurrences of words longer than one character in *line*.

	Splits on single spaces; returns a {word: count} mapping whose
	missing keys default to 0.
	'''
	# int, not bool: values are occurrence counts (the original used
	# defaultdict(bool), which only worked because bool subclasses int).
	count = defaultdict(int)
	for word in line.strip().split(' '):
		if len(word) > 1:
			count[word] += 1

	return count

def main():
	'''Train and evaluate polarity classifiers on the RT review dataset.'''
	f_pos = open('res/rt-polaritydata/rt-polaritydata/rt-polarity.pos', 'rb')
	f_neg = open('res/rt-polaritydata/rt-polaritydata/rt-polarity.neg', 'rb')

	x = []; y = []

	i = 0
	while True:
		line = f_pos.readline()
		if not len(line):
			# EOF: stop before appending (the original pushed one bogus
			# empty document pair after hitting end-of-file).
			break
		x.append(count_words(line.decode('utf-8', 'ignore')))
		y.append(+1)
		x.append(count_words(f_neg.readline().decode('utf-8', 'ignore')))
		y.append(-1)
		i += 1

	# NOTE(review): i counts pos/neg *pairs* while x holds 2*i docs, so
	# this trains on roughly 45% of the data; kept as-is to preserve the
	# original split — confirm whether int(len(x) * 0.9) was intended.
	fold = int(i * 0.9)

	f_pos.close()
	f_neg.close()

	from ml.nb import NaiveBayes

	classifier = NaiveBayes()
	classifier.train(x[0:fold], y[0:fold])
	estim_y = classifier.predict(x[fold:])
	(acc, ) = classifier.evaluate(y[fold:], estim_y)

	print('Naive Bayes accuracy = {0:.2f}%'.format(acc * 100))

	from ml.svm import SVM

	classifier = SVM()
	classifier.train(x[0:fold], y[0:fold])
	estim_y = classifier.predict(x[fold:])
	(acc, ) = classifier.evaluate(y[fold:], estim_y)

	print('SVM accuracy = {0:.2f}%'.format(acc * 100))

if __name__ == '__main__':
	main()
-------------------------------------------------------------------------------- /web/.server.py.swp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/irokez/Pyrus/9a5c64991592ca13e0a5726f394fbe2a399501c3/web/.server.py.swp -------------------------------------------------------------------------------- /web/html/.tagging.html.swp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/irokez/Pyrus/9a5c64991592ca13e0a5726f394fbe2a399501c3/web/html/.tagging.html.swp -------------------------------------------------------------------------------- /web/html/tagging.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Parsing demo - Pyrus 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 |
18 |
19 |
20 |

Pyrus

21 |
22 |
23 |
24 |

Parsing demo

25 | {if len(text) and not len(error)} 26 |
27 |

{$text}

28 | 29 | 30 | {for word, pos in tagged} 31 | 32 | 33 | 34 | 35 | {end} 36 | 37 |
{$word}{$pos}
38 |
39 | 40 | 75 |

Parsed {$words} words in {$time_total} sec, {$words_per_sec} words per sec

76 |
77 | {else} 78 |
79 |

Enter the text and press "Tag"

80 | {if len(error)}

{$error}

{end} 81 |

For example: Съешьте еще этих мягких французских булок, да выпейте же чаю

82 |
83 | {end} 84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |

© 2011–2012 Pyrus

93 |
94 |
95 | 96 | 97 | -------------------------------------------------------------------------------- /web/public/.htaccess: -------------------------------------------------------------------------------- 1 | Options -Indexes 2 | 3 | 4 | Options +FollowSymLinks 5 | RewriteEngine On 6 | RewriteBase / 7 | 8 | RewriteCond %{REQUEST_FILENAME} !-f 9 | # RewriteCond %{REQUEST_FILENAME} !-d 10 | RewriteRule ^(.*)$ http://127.0.0.1:8080/$1 [P] 11 | 12 | 13 | 14 | 15 | Order allow,deny 16 | Deny from all 17 | 18 | 19 | AddDefaultCharset utf-8 20 | -------------------------------------------------------------------------------- /web/public/css/html5.js: -------------------------------------------------------------------------------- 1 | // html5shiv @rem remysharp.com/html5-enabling-script 2 | // iepp v1.6.2 @jon_neal iecss.com/print-protector 3 | // Dual licensed under the MIT or GPL Version 2 licenses 4 | /*@cc_on(function(a,b){function r(a){var b=-1;while(++b";return a.childNodes.length!==1}())){a.iepp=a.iepp||{};var c=a.iepp,d=c.html5elements||"abbr|article|aside|audio|canvas|datalist|details|figcaption|figure|footer|header|hgroup|mark|meter|nav|output|progress|section|summary|time|video",e=d.split("|"),f=e.length,g=new RegExp("(^|\\s)("+d+")","gi"),h=new RegExp("<(/*)("+d+")","gi"),i=/^\s*[\{\}]\s*$/,j=new RegExp("(^|[^\\n]*?\\s)("+d+")([^\\n]*)({[\\n\\w\\W]*?})","gi"),k=b.createDocumentFragment(),l=b.documentElement,m=l.firstChild,n=b.createElement("body"),o=b.createElement("style"),p=/print|all/,q;c.getCSS=function(a,b){if(a+""===undefined)return"";var d=-1,e=a.length,f,g=[];while(++d= fn.length ? 
fn.apply(self,args) : function(){ 18 | return master.apply( self, args.concat(curry.args(arguments)) ); 19 | }; 20 | }; 21 | }; 22 | 23 | curry.args = function( args ){ 24 | return Array.prototype.slice.call(args); 25 | }; 26 | 27 | Function.prototype.curry = function(){ 28 | return curry(this); 29 | }; -------------------------------------------------------------------------------- /web/public/js/d3-tree-test.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by JetBrains RubyMine. 3 | * User: pavanpodila 4 | * Date: 7/17/11 5 | * Time: 4:30 PM 6 | * To change this template use File | Settings | File Templates. 7 | */ 8 | 9 | var treeData = { 10 | name: "/", 11 | contents: [ 12 | { 13 | name: "Applications", 14 | contents: [ 15 | { name: "Mail.app" }, 16 | { name: "iPhoto.app" }, 17 | { name: "Keynote.app" }, 18 | { name: "iTunes.app" }, 19 | { name: "XCode.app" }, 20 | { name: "Numbers.app" }, 21 | { name: "Pages.app" } 22 | ] 23 | }, 24 | { 25 | name: "System", 26 | contents: [] 27 | }, 28 | { 29 | name: "Library", 30 | contents: [ 31 | { 32 | name: "Application Support", 33 | contents: [ 34 | { name: "Adobe" }, 35 | { name: "Apple" }, 36 | { name: "Google" }, 37 | { name: "Microsoft" } 38 | ] 39 | }, 40 | { 41 | name: "Languages", 42 | contents: [ 43 | { name: "Ruby" }, 44 | { name: "Python" }, 45 | { name: "Javascript" }, 46 | { name: "C#" } 47 | ] 48 | }, 49 | { 50 | name: "Developer", 51 | contents: [ 52 | { name: "4.2" }, 53 | { name: "4.3" }, 54 | { name: "5.0" }, 55 | { name: "Documentation" } 56 | ] 57 | } 58 | ] 59 | }, 60 | { 61 | name: "opt", 62 | contents: [] 63 | }, 64 | { 65 | name: "Users", 66 | contents: [ 67 | { name: "pavanpodila" }, 68 | { name: "admin" }, 69 | { name: "test-user" } 70 | ] 71 | } 72 | ] 73 | }; 74 | 75 | function visit(parent, visitFn, childrenFn) 76 | { 77 | if (!parent) return; 78 | 79 | visitFn(parent); 80 | 81 | var children = childrenFn(parent); 82 | if (children) { 
83 | var count = children.length; 84 | for (var i = 0; i < count; i++) { 85 | visit(children[i], visitFn, childrenFn); 86 | } 87 | } 88 | } 89 | 90 | function buildTree(containerName, customOptions) 91 | { 92 | // build the options object 93 | var options = $.extend({ 94 | nodeRadius: 5, fontSize: 12 95 | }, customOptions); 96 | 97 | 98 | // Calculate total nodes, max label length 99 | var totalNodes = 0; 100 | var maxLabelLength = 0; 101 | visit(treeData, function(d) 102 | { 103 | totalNodes++; 104 | maxLabelLength = Math.max(d.name.length, maxLabelLength); 105 | }, function(d) 106 | { 107 | return d.contents && d.contents.length > 0 ? d.contents : null; 108 | }); 109 | 110 | // size of the diagram 111 | var size = { width:$(containerName).outerWidth(), height: totalNodes * 15}; 112 | 113 | var tree = d3.layout.tree() 114 | .sort(null) 115 | .size([size.height, size.width - maxLabelLength * options.fontSize]) 116 | .children(get_children); 117 | 118 | var nodes = tree.nodes({id: 0, name: '*'}); 119 | console.log(nodes); 120 | var links = tree.links(nodes); 121 | console.log(links); 122 | 123 | 124 | /* 125 | 126 | 127 | 128 | */ 129 | var layoutRoot = d3.select(containerName) 130 | .append("svg:svg").attr("width", size.width).attr("height", size.height) 131 | .append("svg:g") 132 | .attr("class", "container") 133 | .attr("transform", "translate(" + maxLabelLength + ",0)"); 134 | 135 | 136 | // Edges between nodes as a 137 | var link = d3.svg.diagonal() 138 | .projection(function(d) 139 | { 140 | return [d.y, d.x]; 141 | }); 142 | 143 | layoutRoot.selectAll("path.link") 144 | .data(links) 145 | .enter() 146 | .append("svg:path") 147 | .attr("class", "link") 148 | .attr("d", link); 149 | 150 | 151 | /* 152 | Nodes as 153 | 154 | 155 | 156 | 157 | */ 158 | var nodeGroup = layoutRoot.selectAll("g.node") 159 | .data(nodes) 160 | .enter() 161 | .append("svg:g") 162 | .attr("class", "node") 163 | .attr("transform", function(d) 164 | { 165 | return "translate(" + d.y + 
"," + d.x + ")"; 166 | }); 167 | 168 | nodeGroup.append("svg:circle") 169 | .attr("class", "node-dot") 170 | .attr("r", options.nodeRadius); 171 | 172 | nodeGroup.append("svg:text") 173 | .attr("text-anchor", function(d) 174 | { 175 | return d.children ? "end" : "start"; 176 | }) 177 | .attr("dx", function(d) 178 | { 179 | var gap = 2 * options.nodeRadius; 180 | return d.children ? -gap : gap; 181 | }) 182 | .attr("dy", 3) 183 | .text(function(d) 184 | { 185 | return d.name; 186 | }); 187 | 188 | } 189 | -------------------------------------------------------------------------------- /web/public/js/dracula_algorithms.js: -------------------------------------------------------------------------------- 1 | /* 2 | * Various algorithms and data structures, licensed under the MIT-license. 3 | * (c) 2010 by Johann Philipp Strathausen 4 | * http://strathausen.eu 5 | * 6 | */ 7 | 8 | 9 | 10 | /* 11 | Bellman-Ford 12 | 13 | Path-finding algorithm, finds the shortest paths from one node to all nodes. 14 | 15 | 16 | Complexity 17 | 18 | O( |E| · |V| ), where E = edges and V = vertices (nodes) 19 | 20 | 21 | Constraints 22 | 23 | Can run on graphs with negative edge weights as long as they do not have 24 | any negative weight cycles. 
25 | 26 | */ 27 | function bellman_ford(g, source) { 28 | 29 | /* STEP 1: initialisation */ 30 | for(var n in g.nodes) 31 | g.nodes[n].distance = Infinity; 32 | /* predecessors are implicitly null */ 33 | source.distance = 0; 34 | 35 | step("Initially, all distances are infinite and all predecessors are null."); 36 | 37 | /* STEP 2: relax each edge (this is at the heart of Bellman-Ford) */ 38 | /* repeat this for the number of nodes minus one */ 39 | for(var i = 1; i < g.nodes.length; i++) 40 | /* for each edge */ 41 | for(var e in g.edges) { 42 | var edge = g.edges[e]; 43 | if(edge.source.distance + edge.weight < edge.target.distance) { 44 | step("Relax edge between " + edge.source.id + " and " + edge.target.id + "."); 45 | edge.target.distance = edge.source.distance + edge.weight; 46 | edge.target.predecessor = edge.source; 47 | } 48 | //Added by Jake Stothard (Needs to be tested) 49 | if(!edge.style.directed) { 50 | if(edge.target.distance + edge.weight < edge.source.distance) { 51 | g.snapShot("Relax edge between "+edge.target.id+" and "+edge.source.id+"."); 52 | edge.source.distance = edge.target.distance + edge.weight; 53 | edge.source.predecessor = edge.target; 54 | } 55 | } 56 | } 57 | step("Ready."); 58 | 59 | /* STEP 3: TODO Check for negative cycles */ 60 | /* For now we assume here that the graph does not contain any negative 61 | weights cycles. 
(this is left as an excercise to the reader[tm]) */ 62 | } 63 | 64 | 65 | 66 | /* 67 | Path-finding algorithm Dijkstra 68 | 69 | - worst-case running time is O((|E| + |V|) · log |V| ) thus better than 70 | Bellman-Ford for sparse graphs (with less edges), but cannot handle 71 | negative edge weights 72 | */ 73 | function dijkstra(g, source) { 74 | 75 | /* initially, all distances are infinite and all predecessors are null */ 76 | for(var n in g.nodes) 77 | g.nodes[n].distance = Infinity; 78 | /* predecessors are implicitly null */ 79 | 80 | g.snapShot("Initially, all distances are infinite and all predecessors are null."); 81 | 82 | source.distance = 0; 83 | /* set of unoptimized nodes, sorted by their distance (but a Fibonacci heap 84 | would be better) */ 85 | var q = new BinaryMinHeap(g.nodes, "distance"); 86 | 87 | /* pointer to the node in focus */ 88 | var node; 89 | 90 | /* get the node with the smallest distance 91 | as long as we have unoptimized nodes. q.min() can have O(log n). */ 92 | while(q.min() != undefined) { 93 | /* remove the latest */ 94 | node = q.extractMin(); 95 | node.optimized = true; 96 | 97 | /* no nodes accessible from this one, should not happen */ 98 | if(node.distance == Infinity) 99 | throw "Orphaned node!"; 100 | 101 | /* for each neighbour of node */ 102 | for(e in node.edges) { 103 | var other = (node == node.edges[e].target) ? 
node.edges[e].source : node.edges[e].target; 104 | 105 | if(other.optimized) 106 | continue; 107 | 108 | /* look for an alternative route */ 109 | var alt = node.distance + node.edges[e].weight; 110 | 111 | /* update distance and route if a better one has been found */ 112 | if (alt < other.distance) { 113 | 114 | /* update distance of neighbour */ 115 | other.distance = alt; 116 | 117 | /* update priority queue */ 118 | q.heapify(); 119 | 120 | /* update path */ 121 | other.predecessor = node; 122 | g.snapShot("Enhancing node.") 123 | } 124 | } 125 | } 126 | } 127 | 128 | 129 | /* All-Pairs-Shortest-Paths */ 130 | /* Runs at worst in O(|V|³) and at best in Omega(|V|³) :-) 131 | complexity Sigma(|V|²) */ 132 | /* This implementation is not yet ready for general use, but works with the 133 | Dracula graph library. */ 134 | function floyd_warshall(g, source) { 135 | 136 | /* Step 1: initialising empty path matrix (second dimension is implicit) */ 137 | var path = []; 138 | var next = []; 139 | var n = g.nodes.length; 140 | 141 | /* construct path matrix, initialize with Infinity */ 142 | for(j in g.nodes) { 143 | path[j] = []; 144 | next[j] = []; 145 | for(i in g.nodes) 146 | path[j][i] = j == i ? 0 : Infinity; 147 | } 148 | 149 | /* initialize path with edge weights */ 150 | for(e in g.edges) 151 | path[g.edges[e].source.id][g.edges[e].target.id] = g.edges[e].weight; 152 | 153 | /* Note: Usually, the initialisation is done by getting the edge weights 154 | from a node matrix representation of the graph, not by iterating through 155 | a list of edges as done here. 
*/ 156 | 157 | /* Step 2: find best distances (the heart of Floyd-Warshall) */ 158 | for(k in g.nodes){ 159 | for(i in g.nodes) { 160 | for(j in g.nodes) 161 | if(path[i][j] > path[i][k] + path[k][j]) { 162 | path[i][j] = path[i][k] + path[k][j]; 163 | /* Step 2.b: remember the path */ 164 | next[i][j] = k; 165 | } 166 | } 167 | } 168 | 169 | /* Step 3: Path reconstruction, get shortest path */ 170 | function getPath(i, j) { 171 | if(path[i][j] == Infinity) 172 | throw "There is no path."; 173 | var intermediate = next[i][j]; 174 | if(intermediate == undefined) 175 | return null; 176 | else 177 | return getPath(i, intermediate) 178 | .concat([intermediate]) 179 | .concat(getPath(intermediate, j)); 180 | } 181 | 182 | /* TODO use the knowledge, e.g. mark path in graph */ 183 | } 184 | 185 | /* 186 | Ford-Fulkerson 187 | 188 | Max-Flow-Min-Cut Algorithm finding the maximum flow through a directed 189 | graph from source to sink. 190 | 191 | 192 | Complexity 193 | 194 | O(E * max(f)), max(f) being the maximum flow 195 | 196 | 197 | Description 198 | 199 | As long as there is an open path through the residual graph, send the 200 | minimum of the residual capacities on the path. 201 | 202 | 203 | Constraints 204 | 205 | The algorithm works only if all weights are integers. Otherwise it is 206 | possible that the Ford–Fulkerson algorithm will not converge to the maximum 207 | value. 208 | 209 | 210 | Input 211 | 212 | g - Graph object 213 | s - Source ID 214 | t - Target (sink) ID 215 | 216 | 217 | Output 218 | 219 | Maximum flow from Source s to Target t 220 | 221 | */ 222 | /* 223 | Edmonds-Karp 224 | 225 | Max-Flow-Min-Cut Algorithm finding the maximum flow through a directed 226 | graph from source to sink. An implementation of the Ford-Fulkerson 227 | algorithm. 
228 | 229 | 230 | Complexity 231 | 232 | O(|V|*|E|²) 233 | 234 | 235 | Input 236 | 237 | g - Graph object (with node and edge lists, capacity is a property of edge) 238 | s - source ID 239 | t - sink ID 240 | 241 | */ 242 | function edmonds_karp(g, s, t) { 243 | 244 | } 245 | 246 | /* 247 | A simple binary min-heap serving as a priority queue 248 | - takes an array as the input, with elements having a key property 249 | - elements will look like this: 250 | { 251 | key: "... key property ...", 252 | value: "... element content ..." 253 | } 254 | - provides insert(), min(), extractMin() and heapify() 255 | - example usage (e.g. via the Firebug or Chromium console): 256 | var x = {foo: 20, hui: "bla"}; 257 | var a = new BinaryMinHeap([x,{foo:3},{foo:10},{foo:20},{foo:30},{foo:6},{foo:1},{foo:3}],"foo"); 258 | console.log(a.extractMin()); 259 | console.log(a.extractMin()); 260 | x.foo = 0; // update key 261 | a.heapify(); // call this always after having a key updated 262 | console.log(a.extractMin()); 263 | console.log(a.extractMin()); 264 | - can also be used on a simple array, like [9,7,8,5] 265 | */ 266 | function BinaryMinHeap(array, key) { 267 | 268 | /* Binary tree stored in an array, no need for a complicated data structure */ 269 | var tree = []; 270 | 271 | var key = key || 'key'; 272 | 273 | /* Calculate the index of the parent or a child */ 274 | var parent = function(index) { return Math.floor((index - 1)/2); }; 275 | var right = function(index) { return 2 * index + 2; }; 276 | var left = function(index) { return 2 * index + 1; }; 277 | 278 | /* Helper function to swap elements with their parent 279 | as long as the parent is bigger */ 280 | function bubble_up(i) { 281 | var p = parent(i); 282 | while((p >= 0) && (tree[i][key] < tree[p][key])) { 283 | /* swap with parent */ 284 | tree[i] = tree.splice(p, 1, tree[i])[0]; 285 | /* go up one level */ 286 | i = p; 287 | p = parent(i); 288 | } 289 | } 290 | 291 | /* Helper function to swap elements with the 
smaller of their children 292 | as long as there is one */ 293 | function bubble_down(i) { 294 | var l = left(i); 295 | var r = right(i); 296 | 297 | /* as long as there are smaller children */ 298 | while(tree[l] && (tree[i][key] > tree[l][key]) || tree[r] && (tree[i][key] > tree[r][key])) { 299 | 300 | /* find smaller child */ 301 | var child = tree[l] ? tree[r] ? tree[l][key] > tree[r][key] ? r : l : l : l; 302 | 303 | /* swap with smaller child with current element */ 304 | tree[i] = tree.splice(child, 1, tree[i])[0]; 305 | 306 | /* go up one level */ 307 | i = child; 308 | l = left(i); 309 | r = right(i); 310 | } 311 | } 312 | 313 | /* Insert a new element with respect to the heap property 314 | 1. Insert the element at the end 315 | 2. Bubble it up until it is smaller than its parent */ 316 | this.insert = function(element) { 317 | 318 | /* make sure there's a key property */ 319 | (element[key] == undefined) && (element = {key:element}); 320 | 321 | /* insert element at the end */ 322 | tree.push(element); 323 | 324 | /* bubble up the element */ 325 | bubble_up(tree.length - 1); 326 | } 327 | 328 | /* Only show us the minimum */ 329 | this.min = function() { 330 | return tree.length == 1 ? undefined : tree[0]; 331 | } 332 | 333 | /* Return and remove the minimum 334 | 1. Take the root as the minimum that we are looking for 335 | 2. Move the last element to the root (thereby deleting the root) 336 | 3. 
Compare the new root with both of its children, swap it with the 337 | smaller child and then check again from there (bubble down) 338 | */ 339 | this.extractMin = function() { 340 | var result = this.min(); 341 | 342 | /* move the last element to the root or empty the tree completely */ 343 | /* bubble down the new root if necessary */ 344 | (tree.length == 1) && (tree = []) || (tree[0] = tree.pop()) && bubble_down(0); 345 | 346 | return result; 347 | } 348 | 349 | /* currently unused, TODO implement */ 350 | this.changeKey = function(index, key) { 351 | throw "function not implemented"; 352 | } 353 | 354 | this.heapify = function() { 355 | for(var start = Math.floor((tree.length - 2) / 2); start >= 0; start--) { 356 | bubble_down(start); 357 | } 358 | } 359 | 360 | /* insert the input elements one by one only when we don't have a key property (TODO can be done more elegant) */ 361 | for(i in (array || [])) 362 | this.insert(array[i]); 363 | } 364 | 365 | 366 | 367 | /* 368 | Quick Sort: 369 | 1. Select some random value from the array, the median. 370 | 2. Divide the array in three smaller arrays according to the elements 371 | being less, equal or greater than the median. 372 | 3. Recursively sort the array containg the elements less than the 373 | median and the one containing elements greater than the median. 374 | 4. Concatenate the three arrays (less, equal and greater). 375 | 5. One or no element is always sorted. 376 | TODO: This could be implemented more efficiently by using only one array object and several pointers. 
377 | */ 378 | function quickSort(arr) { 379 | /* recursion anchor: one element is always sorted */ 380 | if(arr.length <= 1) return arr; 381 | /* randomly selecting some value */ 382 | var median = arr[Math.floor(Math.random() * arr.length)]; 383 | var arr1 = [], arr2 = [], arr3 = []; 384 | for(var i in arr) { 385 | arr[i] < median && arr1.push(arr[i]); 386 | arr[i] == median && arr2.push(arr[i]); 387 | arr[i] > median && arr3.push(arr[i]); 388 | } 389 | /* recursive sorting and assembling final result */ 390 | return quickSort(arr1).concat(arr2).concat(quickSort(arr3)); 391 | } 392 | 393 | /* 394 | Selection Sort: 395 | 1. Select the minimum and remove it from the array 396 | 2. Sort the rest recursively 397 | 3. Return the minimum plus the sorted rest 398 | 4. An array with only one element is already sorted 399 | */ 400 | function selectionSort(arr) { 401 | /* recursion anchor: one element is always sorted */ 402 | if(arr.length == 1) return arr; 403 | var minimum = Infinity; 404 | var index; 405 | for(var i in arr) { 406 | if(arr[i] < minimum) { 407 | minimum = arr[i]; 408 | index = i; /* remember the minimum index for later removal */ 409 | } 410 | } 411 | /* remove the minimum */ 412 | arr.splice(index, 1); 413 | /* assemble result and sort recursively (could be easily done iteratively as well)*/ 414 | return [minimum].concat(selectionSort(arr)); 415 | } 416 | 417 | /* 418 | Merge Sort: 419 | 1. Cut the array in half 420 | 2. Sort each of them recursively 421 | 3. Merge the two sorted arrays 422 | 4. An array with only one element is considered sorted 423 | 424 | */ 425 | function mergeSort(arr) { 426 | /* merges two sorted arrays into one sorted array */ 427 | function merge(a, b) { 428 | /* result set */ 429 | var c = []; 430 | /* as long as there are elements in the arrays to be merged */ 431 | while(a.length > 0 || b.length > 0){ 432 | /* are there elements to be merged, if yes, compare them and merge */ 433 | var n = a.length > 0 && b.length > 0 ? 
a[0] < b[0] ? a.shift() : b.shift() : b.length > 0 ? b.shift() : a.length > 0 ? a.shift() : null; 434 | /* always push the smaller one onto the result set */ 435 | n != null && c.push(n); 436 | } 437 | return c; 438 | } 439 | /* this mergeSort implementation cuts the array in half, wich should be fine with randomized arrays, but introduces the risk of a worst-case scenario */ 440 | median = Math.floor(arr.length / 2); 441 | var part1 = arr.slice(0, median); /* for some reason it doesn't work if inserted directly in the return statement (tried so with firefox) */ 442 | var part2 = arr.slice(median - arr.length); 443 | return arr.length <= 1 ? arr : merge( 444 | mergeSort(part1), /* first half */ 445 | mergeSort(part2) /* second half */ 446 | ); 447 | } 448 | 449 | /* Balanced Red-Black-Tree */ 450 | function RedBlackTree(arr) { 451 | 452 | } 453 | 454 | function BTree(arr) { 455 | 456 | } 457 | 458 | function NaryTree(n, arr) { 459 | 460 | } 461 | 462 | /** 463 | * Knuth-Morris-Pratt string matching algorithm - finds a pattern in a text. 464 | * FIXME: Doesn't work correctly yet. 465 | */ 466 | function kmp(p, t) { 467 | 468 | /** 469 | * PREFIX, OVERLAP or FALIURE function for KMP. Computes how many iterations 470 | * the algorithm can skip after a mismatch. 471 | * 472 | * @input p - pattern (string) 473 | * @result array of skippable iterations 474 | */ 475 | function prefix(p) { 476 | /* pi contains the computed skip marks */ 477 | var pi = [0], k = 0; 478 | for(q = 1; q < p.length; q++) { 479 | while(k > 0 && (p.charAt(k) != p.charAt(q))) 480 | k = pi[k-1]; 481 | 482 | (p.charAt(k) == p.charAt(q)) && k++; 483 | 484 | pi[q] = k; 485 | } 486 | return pi; 487 | } 488 | 489 | /* The actual KMP algorithm starts here. 
*/ 490 | 491 | var pi = prefix(p), q = 0, result = []; 492 | 493 | for(var i = 0; i < t.length; i++) { 494 | /* jump forward as long as the character doesn't match */ 495 | while((q > 0) && (p.charAt(q) != t.charAt(i))) 496 | q = pi[q]; 497 | 498 | (p.charAt(q) == t.charAt(i)) && q++; 499 | 500 | (q == p.length) && result.push(i - p.length) && (q = pi[q]); 501 | } 502 | 503 | return result; 504 | } 505 | 506 | /* step for algorithm visualisation */ 507 | function step(comment, funct) { 508 | //wait for input 509 | //display comment (before or after waiting) 510 | // next.wait(); 511 | /* execute callback function */ 512 | funct(); 513 | } 514 | 515 | /** 516 | * Curry - Function currying 517 | * Copyright (c) 2008 Ariel Flesler - aflesler(at)gmail(dot)com | http://flesler.blogspot.com 518 | * Licensed under BSD (http://www.opensource.org/licenses/bsd-license.php) 519 | * Date: 10/4/2008 520 | * 521 | * @author Ariel Flesler 522 | * @version 1.0.1 523 | */ 524 | function curry( fn ){ 525 | return function(){ 526 | var args = curry.args(arguments), 527 | master = arguments.callee, 528 | self = this; 529 | 530 | return args.length >= fn.length ? 
fn.apply(self,args) : function(){ 531 | return master.apply( self, args.concat(curry.args(arguments)) ); 532 | }; 533 | }; 534 | }; 535 | 536 | curry.args = function( args ){ 537 | return Array.prototype.slice.call(args); 538 | }; 539 | 540 | Function.prototype.curry = function(){ 541 | return curry(this); 542 | }; 543 | 544 | /** 545 | * Topological Sort 546 | * 547 | * Sort a directed graph based on incoming edges 548 | * 549 | * Coded by Jake Stothard 550 | */ 551 | function topological_sort(g) { 552 | //Mark nodes as "deleted" instead of actually deleting them 553 | //That way we don't have to copy g 554 | 555 | for(i in g.nodes) 556 | g.nodes[i].deleted = false; 557 | 558 | var ret = topological_sort_helper(g); 559 | 560 | //Cleanup: Remove the deleted property 561 | for(i in g.nodes) 562 | delete g.nodes[i].deleted 563 | 564 | return ret; 565 | } 566 | function topological_sort_helper(g) { 567 | //Find node with no incoming edges 568 | var node; 569 | for(i in g.nodes) { 570 | if(g.nodes[i].deleted) 571 | continue; //Bad style, meh 572 | 573 | var incoming = false; 574 | for(j in g.nodes[i].edges) { 575 | if(g.nodes[i].edges[j].target == g.nodes[i] 576 | && g.nodes[i].edges[j].source.deleted == false) { 577 | incoming = true; 578 | break; 579 | } 580 | } 581 | if(!incoming) { 582 | node = g.nodes[i]; 583 | break; 584 | } 585 | } 586 | 587 | // Either unsortable or done. 
Either way, GTFO 588 | if(node == undefined) 589 | return []; 590 | 591 | //"Delete" node from g 592 | node.deleted = true; 593 | 594 | var tail = topological_sort_helper(g); 595 | 596 | tail.unshift(node); 597 | 598 | return tail; 599 | } 600 | -------------------------------------------------------------------------------- /web/public/js/dracula_graffle.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Originally grabbed from the official RaphaelJS Documentation 3 | * http://raphaeljs.com/graffle.html 4 | * Adopted (arrows) and commented by Philipp Strathausen http://blog.ameisenbar.de 5 | * Licenced under the MIT licence. 6 | */ 7 | 8 | /** 9 | * Usage: 10 | * connect two shapes 11 | * parameters: 12 | * source shape [or connection for redrawing], 13 | * target shape, 14 | * style with { fg : linecolor, bg : background color, directed: boolean } 15 | * returns: 16 | * connection { draw = function() } 17 | */ 18 | Raphael.fn.connection = function (obj1, obj2, style) { 19 | var selfRef = this; 20 | /* create and return new connection */ 21 | var edge = {/* 22 | from : obj1, 23 | to : obj2, 24 | style : style,*/ 25 | draw : function() { 26 | /* get bounding boxes of target and source */ 27 | var bb1 = obj1.getBBox(); 28 | var bb2 = obj2.getBBox(); 29 | var off1 = 0; 30 | var off2 = 0; 31 | /* coordinates for potential connection coordinates from/to the objects */ 32 | var p = [ 33 | {x: bb1.x + bb1.width / 2, y: bb1.y - off1}, /* NORTH 1 */ 34 | {x: bb1.x + bb1.width / 2, y: bb1.y + bb1.height + off1}, /* SOUTH 1 */ 35 | {x: bb1.x - off1, y: bb1.y + bb1.height / 2}, /* WEST 1 */ 36 | {x: bb1.x + bb1.width + off1, y: bb1.y + bb1.height / 2}, /* EAST 1 */ 37 | {x: bb2.x + bb2.width / 2, y: bb2.y - off2}, /* NORTH 2 */ 38 | {x: bb2.x + bb2.width / 2, y: bb2.y + bb2.height + off2}, /* SOUTH 2 */ 39 | {x: bb2.x - off2, y: bb2.y + bb2.height / 2}, /* WEST 2 */ 40 | {x: bb2.x + bb2.width + off2, y: bb2.y + bb2.height / 2} 
/* EAST 2 */ 41 | ]; 42 | 43 | /* distances between objects and according coordinates connection */ 44 | var d = {}, dis = []; 45 | 46 | /* 47 | * find out the best connection coordinates by trying all possible ways 48 | */ 49 | /* loop the first object's connection coordinates */ 50 | for (var i = 0; i < 4; i++) { 51 | /* loop the seond object's connection coordinates */ 52 | for (var j = 4; j < 8; j++) { 53 | var dx = Math.abs(p[i].x - p[j].x), 54 | dy = Math.abs(p[i].y - p[j].y); 55 | if ((i == j - 4) || (((i != 3 && j != 6) || p[i].x < p[j].x) && ((i != 2 && j != 7) || p[i].x > p[j].x) && ((i != 0 && j != 5) || p[i].y > p[j].y) && ((i != 1 && j != 4) || p[i].y < p[j].y))) { 56 | dis.push(dx + dy); 57 | d[dis[dis.length - 1].toFixed(3)] = [i, j]; 58 | } 59 | } 60 | } 61 | var res = dis.length == 0 ? [0, 4] : d[Math.min.apply(Math, dis).toFixed(3)]; 62 | /* bezier path */ 63 | var x1 = p[res[0]].x, 64 | y1 = p[res[0]].y, 65 | x4 = p[res[1]].x, 66 | y4 = p[res[1]].y, 67 | dx = Math.max(Math.abs(x1 - x4) / 2, 10), 68 | dy = Math.max(Math.abs(y1 - y4) / 2, 10), 69 | x2 = [x1, x1, x1 - dx, x1 + dx][res[0]].toFixed(3), 70 | y2 = [y1 - dy, y1 + dy, y1, y1][res[0]].toFixed(3), 71 | x3 = [0, 0, 0, 0, x4, x4, x4 - dx, x4 + dx][res[1]].toFixed(3), 72 | y3 = [0, 0, 0, 0, y1 + dy, y1 - dy, y4, y4][res[1]].toFixed(3); 73 | /* assemble path and arrow */ 74 | var path = ["M", x1.toFixed(3), y1.toFixed(3), "C", x2, y2, x3, y3, x4.toFixed(3), y4.toFixed(3)].join(","); 75 | /* arrow */ 76 | if(style && style.directed) { 77 | /* magnitude, length of the last path vector */ 78 | var mag = Math.sqrt((y4 - y3) * (y4 - y3) + (x4 - x3) * (x4 - x3)); 79 | /* vector normalisation to specified length */ 80 | var norm = function(x,l){return (-x*(l||5)/mag);}; 81 | /* calculate array coordinates (two lines orthogonal to the path vector) */ 82 | var arr = [ 83 | {x:(norm(x4-x3)+norm(y4-y3)+x4).toFixed(3), y:(norm(y4-y3)+norm(x4-x3)+y4).toFixed(3)}, 84 | 
{x:(norm(x4-x3)-norm(y4-y3)+x4).toFixed(3), y:(norm(y4-y3)-norm(x4-x3)+y4).toFixed(3)} 85 | ]; 86 | path = path + ",M"+arr[0].x+","+arr[0].y+",L"+x4+","+y4+",L"+arr[1].x+","+arr[1].y; 87 | } 88 | /* function to be used for moving existent path(s), e.g. animate() or attr() */ 89 | var move = "attr"; 90 | /* applying path(s) */ 91 | edge.fg && edge.fg[move]({path:path}) 92 | || (edge.fg = selfRef.path(path).attr({stroke: style && style.stroke || "#000", fill: "none"}).toBack()); 93 | edge.bg && edge.bg[move]({path:path}) 94 | || style && style.fill && (edge.bg = style.fill.split && selfRef.path(path).attr({stroke: style.fill.split("|")[0], fill: "none", "stroke-width": style.fill.split("|")[1] || 3}).toBack()); 95 | /* setting label */ 96 | style && style.label 97 | && (edge.label && edge.label.attr({x:(x1+x4)/2, y:(y1+y4)/2}) 98 | || (edge.label = selfRef.text((x1+x4)/2, (y1+y4)/2, style.label).attr({fill: "#000", "font-size": style["font-size"] || "12px"}))); 99 | style && style.label && style["label-style"] && edge.label && edge.label.attr(style["label-style"]); 100 | style && style.callback && style.callback(edge); 101 | } 102 | } 103 | edge.draw(); 104 | return edge; 105 | }; 106 | //Raphael.prototype.set.prototype.dodo=function(){console.log("works");}; 107 | -------------------------------------------------------------------------------- /web/public/js/dracula_graph.js: -------------------------------------------------------------------------------- 1 | /* 2 | * Dracula Graph Layout and Drawing Framework 0.0.3alpha 3 | * (c) 2010 Philipp Strathausen , http://strathausen.eu 4 | * Contributions by Jake Stothard . 
5 | * 6 | * based on the Graph JavaScript framework, version 0.0.1 7 | * (c) 2006 Aslak Hellesoy 8 | * (c) 2006 Dave Hoover 9 | * 10 | * Ported from Graph::Layouter::Spring in 11 | * http://search.cpan.org/~pasky/Graph-Layderer-0.02/ 12 | * The algorithm is based on a spring-style layouter of a Java-based social 13 | * network tracker PieSpy written by Paul Mutton . 14 | * 15 | * This code is freely distributable under the MIT license. Commercial use is 16 | * hereby granted without any cost or restriction. 17 | * 18 | * Links: 19 | * 20 | * Graph Dracula JavaScript Framework: 21 | * http://graphdracula.net 22 | * 23 | /*--------------------------------------------------------------------------*/ 24 | 25 | /* 26 | * Edge Factory 27 | */ 28 | var AbstractEdge = function() { 29 | } 30 | AbstractEdge.prototype = { 31 | hide: function() { 32 | this.connection.fg.hide(); 33 | this.connection.bg && this.bg.connection.hide(); 34 | } 35 | }; 36 | var EdgeFactory = function() { 37 | this.template = new AbstractEdge(); 38 | this.template.style = new Object(); 39 | this.template.style.directed = false; 40 | this.template.weight = 1; 41 | }; 42 | EdgeFactory.prototype = { 43 | build: function(source, target) { 44 | var e = jQuery.extend(true, {}, this.template); 45 | e.source = source; 46 | e.target = target; 47 | return e; 48 | } 49 | }; 50 | 51 | /* 52 | * Graph 53 | */ 54 | var Graph = function() { 55 | this.nodes = {}; 56 | this.edges = []; 57 | this.snapshots = []; // previous graph states TODO to be implemented 58 | this.edgeFactory = new EdgeFactory(); 59 | }; 60 | Graph.prototype = { 61 | /* 62 | * add a node 63 | * @id the node's ID (string or number) 64 | * @content (optional, dictionary) can contain any information that is 65 | * being interpreted by the layout algorithm or the graph 66 | * representation 67 | */ 68 | addNode: function(id, content) { 69 | /* testing if node is already existing in the graph */ 70 | if(this.nodes[id] == undefined) { 71 | 
this.nodes[id] = new Graph.Node(id, content); 72 | } 73 | return this.nodes[id]; 74 | }, 75 | 76 | addEdge: function(source, target, style) { 77 | var s = this.addNode(source); 78 | var t = this.addNode(target); 79 | var edge = this.edgeFactory.build(s, t); 80 | jQuery.extend(edge.style,style); 81 | s.edges.push(edge); 82 | this.edges.push(edge); 83 | // NOTE: Even directed edges are added to both nodes. 84 | t.edges.push(edge); 85 | }, 86 | 87 | /* TODO to be implemented 88 | * Preserve a copy of the graph state (nodes, positions, ...) 89 | * @comment a comment describing the state 90 | */ 91 | snapShot: function(comment) { 92 | /* FIXME 93 | var graph = new Graph(); 94 | graph.nodes = jQuery.extend(true, {}, this.nodes); 95 | graph.edges = jQuery.extend(true, {}, this.edges); 96 | this.snapshots.push({comment: comment, graph: graph}); 97 | */ 98 | }, 99 | removeNode: function(id) { 100 | delete this.nodes[id]; 101 | for(var i = 0; i < this.edges.length; i++) { 102 | if (this.edges[i].source.id == id || this.edges[i].target.id == id) { 103 | this.edges.splice(i, 1); 104 | i--; 105 | } 106 | } 107 | } 108 | }; 109 | 110 | /* 111 | * Node 112 | */ 113 | Graph.Node = function(id, node){ 114 | node = node || {}; 115 | node.id = id; 116 | node.edges = []; 117 | node.hide = function() { 118 | this.hidden = true; 119 | this.shape && this.shape.hide(); /* FIXME this is representation specific code and should be elsewhere */ 120 | for(i in this.edges) 121 | (this.edges[i].source.id == id || this.edges[i].target == id) && this.edges[i].hide && this.edges[i].hide(); 122 | }; 123 | node.show = function() { 124 | this.hidden = false; 125 | this.shape && this.shape.show(); 126 | for(i in this.edges) 127 | (this.edges[i].source.id == id || this.edges[i].target == id) && this.edges[i].show && this.edges[i].show(); 128 | }; 129 | return node; 130 | }; 131 | Graph.Node.prototype = { 132 | }; 133 | 134 | /* 135 | * Renderer base class 136 | */ 137 | Graph.Renderer = {}; 138 | 139 | 
/* 140 | * Renderer implementation using RaphaelJS 141 | */ 142 | Graph.Renderer.Raphael = function(element, graph, width, height) { 143 | this.width = width || 400; 144 | this.height = height || 400; 145 | var selfRef = this; 146 | this.r = Raphael(element, this.width, this.height); 147 | this.radius = 40; /* max dimension of a node */ 148 | this.graph = graph; 149 | this.mouse_in = false; 150 | 151 | /* TODO default node rendering function */ 152 | if(!this.graph.render) { 153 | this.graph.render = function() { 154 | return; 155 | } 156 | } 157 | 158 | /* 159 | * Dragging 160 | */ 161 | this.isDrag = false; 162 | this.dragger = function (e) { 163 | this.dx = e.clientX; 164 | this.dy = e.clientY; 165 | selfRef.isDrag = this; 166 | this.set && this.set.animate({"fill-opacity": .1}, 200) && this.set.toFront(); 167 | e.preventDefault && e.preventDefault(); 168 | }; 169 | 170 | var d = document.getElementById(element); 171 | d.onmousemove = function (e) { 172 | e = e || window.event; 173 | if (selfRef.isDrag) { 174 | var bBox = selfRef.isDrag.set.getBBox(); 175 | // TODO round the coordinates here (eg. for proper image representation) 176 | var newX = e.clientX - selfRef.isDrag.dx + (bBox.x + bBox.width / 2); 177 | var newY = e.clientY - selfRef.isDrag.dy + (bBox.y + bBox.height / 2); 178 | /* prevent shapes from being dragged out of the canvas */ 179 | var clientX = e.clientX - (newX < 20 ? newX - 20 : newX > selfRef.width - 20 ? newX - selfRef.width + 20 : 0); 180 | var clientY = e.clientY - (newY < 20 ? newY - 20 : newY > selfRef.height - 20 ? 
newY - selfRef.height + 20 : 0); 181 | selfRef.isDrag.set.translate(clientX - Math.round(selfRef.isDrag.dx), clientY - Math.round(selfRef.isDrag.dy)); 182 | // console.log(clientX - Math.round(selfRef.isDrag.dx), clientY - Math.round(selfRef.isDrag.dy)); 183 | for (var i in selfRef.graph.edges) { 184 | selfRef.graph.edges[i].connection && selfRef.graph.edges[i].connection.draw(); 185 | } 186 | //selfRef.r.safari(); 187 | selfRef.isDrag.dx = clientX; 188 | selfRef.isDrag.dy = clientY; 189 | } 190 | }; 191 | d.onmouseup = function () { 192 | selfRef.isDrag && selfRef.isDrag.set.animate({"fill-opacity": .6}, 500); 193 | selfRef.isDrag = false; 194 | }; 195 | this.draw(); 196 | }; 197 | Graph.Renderer.Raphael.prototype = { 198 | translate: function(point) { 199 | return [ 200 | (point[0] - this.graph.layoutMinX) * this.factorX + this.radius, 201 | (point[1] - this.graph.layoutMinY) * this.factorY + this.radius 202 | ]; 203 | }, 204 | 205 | rotate: function(point, length, angle) { 206 | var dx = length * Math.cos(angle); 207 | var dy = length * Math.sin(angle); 208 | return [point[0]+dx, point[1]+dy]; 209 | }, 210 | 211 | draw: function() { 212 | this.factorX = (this.width - 2 * this.radius) / (this.graph.layoutMaxX - this.graph.layoutMinX); 213 | this.factorY = (this.height - 2 * this.radius) / (this.graph.layoutMaxY - this.graph.layoutMinY); 214 | for (i in this.graph.nodes) { 215 | this.drawNode(this.graph.nodes[i]); 216 | } 217 | for (var i = 0; i < this.graph.edges.length; i++) { 218 | this.drawEdge(this.graph.edges[i]); 219 | } 220 | }, 221 | 222 | drawNode: function(node) { 223 | var point = this.translate([node.layoutPosX, node.layoutPosY]); 224 | node.point = point; 225 | 226 | /* if node has already been drawn, move the nodes */ 227 | if(node.shape) { 228 | var oBBox = node.shape.getBBox(); 229 | var opoint = { x: oBBox.x + oBBox.width / 2, y: oBBox.y + oBBox.height / 2}; 230 | node.shape.translate(Math.round(point[0] - opoint.x), Math.round(point[1] - 
opoint.y)); 231 | this.r.safari(); 232 | return node; 233 | }/* else, draw new nodes */ 234 | 235 | var shape; 236 | 237 | /* if a node renderer function is provided by the user, then use it 238 | or the default render function instead */ 239 | if(!node.render) { 240 | node.render = function(r, node) { 241 | /* the default node drawing */ 242 | var color = Raphael.getColor(); 243 | var ellipse = r.ellipse(0, 0, 30, 20).attr({fill: color, stroke: color, "stroke-width": 2}); 244 | /* set DOM node ID */ 245 | ellipse.node.id = node.label || node.id; 246 | shape = r.set(). 247 | push(ellipse). 248 | push(r.text(0, 30, node.label || node.id)); 249 | return shape; 250 | } 251 | } 252 | /* or check for an ajax representation of the nodes */ 253 | if(node.shapes) { 254 | // TODO ajax representation evaluation 255 | } 256 | 257 | shape = node.render(this.r, node).hide(); 258 | 259 | shape.attr({"fill-opacity": .6}); 260 | /* re-reference to the node an element belongs to, needed for dragging all elements of a node */ 261 | shape.items.forEach(function(item){ item.set = shape; item.node.style.cursor = "move"; }); 262 | shape.mousedown(this.dragger); 263 | 264 | var box = shape.getBBox(); 265 | shape.translate(Math.round(point[0]-(box.x+box.width/2)),Math.round(point[1]-(box.y+box.height/2))) 266 | //console.log(box,point); 267 | node.hidden || shape.show(); 268 | node.shape = shape; 269 | }, 270 | drawEdge: function(edge) { 271 | /* if this edge already exists the other way around and is undirected */ 272 | if(edge.backedge) 273 | return; 274 | if(edge.source.hidden || edge.target.hidden) { 275 | edge.connection && edge.connection.fg.hide() | edge.connection.bg && edge.connection.bg.hide(); 276 | return; 277 | } 278 | /* if edge already has been drawn, only refresh the edge */ 279 | if(!edge.connection) { 280 | edge.style && edge.style.callback && edge.style.callback(edge); // TODO move this somewhere else 281 | edge.connection = this.r.connection(edge.source.shape, 
edge.target.shape, edge.style); 282 | return; 283 | } 284 | //FIXME showing doesn't work well 285 | edge.connection.fg.show(); 286 | edge.connection.bg && edge.connection.bg.show(); 287 | edge.connection.draw(); 288 | } 289 | }; 290 | Graph.Layout = {}; 291 | Graph.Layout.Spring = function(graph) { 292 | this.graph = graph; 293 | this.iterations = 500; 294 | this.maxRepulsiveForceDistance = 6; 295 | this.k = 2; 296 | this.c = 0.01; 297 | this.maxVertexMovement = 0.5; 298 | this.layout(); 299 | }; 300 | Graph.Layout.Spring.prototype = { 301 | layout: function() { 302 | this.layoutPrepare(); 303 | for (var i = 0; i < this.iterations; i++) { 304 | this.layoutIteration(); 305 | } 306 | this.layoutCalcBounds(); 307 | }, 308 | 309 | layoutPrepare: function() { 310 | for (i in this.graph.nodes) { 311 | var node = this.graph.nodes[i]; 312 | node.layoutPosX = 0; 313 | node.layoutPosY = 0; 314 | node.layoutForceX = 0; 315 | node.layoutForceY = 0; 316 | } 317 | 318 | }, 319 | 320 | layoutCalcBounds: function() { 321 | var minx = Infinity, maxx = -Infinity, miny = Infinity, maxy = -Infinity; 322 | 323 | for (i in this.graph.nodes) { 324 | var x = this.graph.nodes[i].layoutPosX; 325 | var y = this.graph.nodes[i].layoutPosY; 326 | 327 | if(x > maxx) maxx = x; 328 | if(x < minx) minx = x; 329 | if(y > maxy) maxy = y; 330 | if(y < miny) miny = y; 331 | } 332 | 333 | this.graph.layoutMinX = minx; 334 | this.graph.layoutMaxX = maxx; 335 | this.graph.layoutMinY = miny; 336 | this.graph.layoutMaxY = maxy; 337 | }, 338 | 339 | layoutIteration: function() { 340 | // Forces on nodes due to node-node repulsions 341 | 342 | var prev = new Array(); 343 | for(var c in this.graph.nodes) { 344 | var node1 = this.graph.nodes[c]; 345 | for (var d in prev) { 346 | var node2 = this.graph.nodes[prev[d]]; 347 | this.layoutRepulsive(node1, node2); 348 | 349 | } 350 | prev.push(c); 351 | } 352 | 353 | // Forces on nodes due to edge attractions 354 | for (var i = 0; i < this.graph.edges.length; i++) { 
355 | var edge = this.graph.edges[i]; 356 | this.layoutAttractive(edge); 357 | } 358 | 359 | // Move by the given force 360 | for (i in this.graph.nodes) { 361 | var node = this.graph.nodes[i]; 362 | var xmove = this.c * node.layoutForceX; 363 | var ymove = this.c * node.layoutForceY; 364 | 365 | var max = this.maxVertexMovement; 366 | if(xmove > max) xmove = max; 367 | if(xmove < -max) xmove = -max; 368 | if(ymove > max) ymove = max; 369 | if(ymove < -max) ymove = -max; 370 | 371 | node.layoutPosX += xmove; 372 | node.layoutPosY += ymove; 373 | node.layoutForceX = 0; 374 | node.layoutForceY = 0; 375 | } 376 | }, 377 | 378 | layoutRepulsive: function(node1, node2) { 379 | if (typeof node1 == 'undefined' || typeof node2 == 'undefined') 380 | return; 381 | var dx = node2.layoutPosX - node1.layoutPosX; 382 | var dy = node2.layoutPosY - node1.layoutPosY; 383 | var d2 = dx * dx + dy * dy; 384 | if(d2 < 0.01) { 385 | dx = 0.1 * Math.random() + 0.1; 386 | dy = 0.1 * Math.random() + 0.1; 387 | var d2 = dx * dx + dy * dy; 388 | } 389 | var d = Math.sqrt(d2); 390 | if(d < this.maxRepulsiveForceDistance) { 391 | var repulsiveForce = this.k * this.k / d; 392 | node2.layoutForceX += repulsiveForce * dx / d; 393 | node2.layoutForceY += repulsiveForce * dy / d; 394 | node1.layoutForceX -= repulsiveForce * dx / d; 395 | node1.layoutForceY -= repulsiveForce * dy / d; 396 | } 397 | }, 398 | 399 | layoutAttractive: function(edge) { 400 | var node1 = edge.source; 401 | var node2 = edge.target; 402 | 403 | var dx = node2.layoutPosX - node1.layoutPosX; 404 | var dy = node2.layoutPosY - node1.layoutPosY; 405 | var d2 = dx * dx + dy * dy; 406 | if(d2 < 0.01) { 407 | dx = 0.1 * Math.random() + 0.1; 408 | dy = 0.1 * Math.random() + 0.1; 409 | var d2 = dx * dx + dy * dy; 410 | } 411 | var d = Math.sqrt(d2); 412 | if(d > this.maxRepulsiveForceDistance) { 413 | d = this.maxRepulsiveForceDistance; 414 | d2 = d * d; 415 | } 416 | var attractiveForce = (d2 - this.k * this.k) / this.k; 417 | 
if(edge.attraction == undefined) edge.attraction = 1; 418 | attractiveForce *= Math.log(edge.attraction) * 0.5 + 1; 419 | 420 | node2.layoutForceX -= attractiveForce * dx / d; 421 | node2.layoutForceY -= attractiveForce * dy / d; 422 | node1.layoutForceX += attractiveForce * dx / d; 423 | node1.layoutForceY += attractiveForce * dy / d; 424 | } 425 | }; 426 | 427 | Graph.Layout.Ordered = function(graph, order) { 428 | this.graph = graph; 429 | this.order = order; 430 | this.layout(); 431 | }; 432 | Graph.Layout.Ordered.prototype = { 433 | layout: function() { 434 | this.layoutPrepare(); 435 | this.layoutCalcBounds(); 436 | }, 437 | 438 | layoutPrepare: function(order) { 439 | for (i in this.graph.nodes) { 440 | var node = this.graph.nodes[i]; 441 | node.layoutPosX = 0; 442 | node.layoutPosY = 0; 443 | } 444 | var counter = 0; 445 | for (i in this.order) { 446 | var node = this.order[i]; 447 | node.layoutPosX = counter; 448 | node.layoutPosY = Math.random(); 449 | counter++; 450 | } 451 | }, 452 | 453 | layoutCalcBounds: function() { 454 | var minx = Infinity, maxx = -Infinity, miny = Infinity, maxy = -Infinity; 455 | 456 | for (i in this.graph.nodes) { 457 | var x = this.graph.nodes[i].layoutPosX; 458 | var y = this.graph.nodes[i].layoutPosY; 459 | 460 | if(x > maxx) maxx = x; 461 | if(x < minx) minx = x; 462 | if(y > maxy) maxy = y; 463 | if(y < miny) miny = y; 464 | } 465 | 466 | this.graph.layoutMinX = minx; 467 | this.graph.layoutMaxX = maxx; 468 | 469 | this.graph.layoutMinY = miny; 470 | this.graph.layoutMaxY = maxy; 471 | } 472 | }; 473 | 474 | /* 475 | * usefull JavaScript extensions, 476 | */ 477 | 478 | function log(a) {console.log&&console.log(a);} 479 | 480 | /* 481 | * Raphael Tooltip Plugin 482 | * - attaches an element as a tooltip to another element 483 | * 484 | * Usage example, adding a rectangle as a tooltip to a circle: 485 | * 486 | * paper.circle(100,100,10).tooltip(paper.rect(0,0,20,30)); 487 | * 488 | * If you want to use more shapes, you'll 
have to put them into a set. 489 | * 490 | */ 491 | Raphael.el.tooltip = function (tp) { 492 | this.tp = tp; 493 | this.tp.o = {x: 0, y: 0}; 494 | this.tp.hide(); 495 | this.hover( 496 | function(event){ 497 | this.mousemove(function(event){ 498 | this.tp.translate(event.clientX - 499 | this.tp.o.x,event.clientY - this.tp.o.y); 500 | this.tp.o = {x: event.clientX, y: event.clientY}; 501 | }); 502 | this.tp.show().toFront(); 503 | }, 504 | function(event){ 505 | this.tp.hide(); 506 | this.unmousemove(); 507 | }); 508 | return this; 509 | }; 510 | 511 | /* For IE */ 512 | if (!Array.prototype.forEach) 513 | { 514 | Array.prototype.forEach = function(fun /*, thisp*/) 515 | { 516 | var len = this.length; 517 | if (typeof fun != "function") 518 | throw new TypeError(); 519 | 520 | var thisp = arguments[1]; 521 | for (var i = 0; i < len; i++) 522 | { 523 | if (i in this) 524 | fun.call(thisp, this[i], i, this); 525 | } 526 | }; 527 | } 528 | -------------------------------------------------------------------------------- /web/public/js/html5.js: -------------------------------------------------------------------------------- 1 | // html5shiv @rem remysharp.com/html5-enabling-script 2 | // iepp v1.6.2 @jon_neal iecss.com/print-protector 3 | // Dual licensed under the MIT or GPL Version 2 licenses 4 | /*@cc_on(function(a,b){function r(a){var b=-1;while(++b";return a.childNodes.length!==1}())){a.iepp=a.iepp||{};var c=a.iepp,d=c.html5elements||"abbr|article|aside|audio|canvas|datalist|details|figcaption|figure|footer|header|hgroup|mark|meter|nav|output|progress|section|summary|time|video",e=d.split("|"),f=e.length,g=new RegExp("(^|\\s)("+d+")","gi"),h=new RegExp("<(/*)("+d+")","gi"),i=/^\s*[\{\}]\s*$/,j=new RegExp("(^|[^\\n]*?\\s)("+d+")([^\\n]*)({[\\n\\w\\W]*?})","gi"),k=b.createDocumentFragment(),l=b.documentElement,m=l.firstChild,n=b.createElement("body"),o=b.createElement("style"),p=/print|all/,q;c.getCSS=function(a,b){if(a+""===undefined)return"";var 
d=-1,e=a.length,f,g=[];while(++d 11 | // 12 | // Math.seedrandom('yipee'); Sets Math.random to a function that is 13 | // initialized using the given explicit seed. 14 | // 15 | // Math.seedrandom(); Sets Math.random to a function that is 16 | // seeded using the current time, dom state, 17 | // and other accumulated local entropy. 18 | // The generated seed string is returned. 19 | // 20 | // Math.seedrandom('yowza', true); 21 | // Seeds using the given explicit seed mixed 22 | // together with accumulated entropy. 23 | // 24 | // 25 | // Seeds using physical random bits downloaded 26 | // from random.org. 27 | // 28 | // Examples: 29 | // 30 | // Math.seedrandom("hello"); // Use "hello" as the seed. 31 | // document.write(Math.random()); // Always 0.5463663768140734 32 | // document.write(Math.random()); // Always 0.43973793770592234 33 | // var rng1 = Math.random; // Remember the current prng. 34 | // 35 | // var autoseed = Math.seedrandom(); // New prng with an automatic seed. 36 | // document.write(Math.random()); // Pretty much unpredictable. 37 | // 38 | // Math.random = rng1; // Continue "hello" prng sequence. 39 | // document.write(Math.random()); // Always 0.554769432473455 40 | // 41 | // Math.seedrandom(autoseed); // Restart at the previous seed. 42 | // document.write(Math.random()); // Repeat the 'unpredictable' value. 43 | // 44 | // Notes: 45 | // 46 | // Each time seedrandom('arg') is called, entropy from the passed seed 47 | // is accumulated in a pool to help generate future seeds for the 48 | // zero-argument form of Math.seedrandom, so entropy can be injected over 49 | // time by calling seedrandom with explicit data repeatedly. 50 | // 51 | // On speed - This javascript implementation of Math.random() is about 52 | // 3-10x slower than the built-in Math.random() because it is not native 53 | // code, but this is typically fast enough anyway. Seeding is more expensive, 54 | // especially if you use auto-seeding. 
Some details (timings on Chrome 4): 55 | // 56 | // Our Math.random() - avg less than 0.002 milliseconds per call 57 | // seedrandom('explicit') - avg less than 0.5 milliseconds per call 58 | // seedrandom('explicit', true) - avg less than 2 milliseconds per call 59 | // seedrandom() - avg about 38 milliseconds per call 60 | // 61 | // LICENSE (BSD): 62 | // 63 | // Copyright 2010 David Bau, all rights reserved. 64 | // 65 | // Redistribution and use in source and binary forms, with or without 66 | // modification, are permitted provided that the following conditions are met: 67 | // 68 | // 1. Redistributions of source code must retain the above copyright 69 | // notice, this list of conditions and the following disclaimer. 70 | // 71 | // 2. Redistributions in binary form must reproduce the above copyright 72 | // notice, this list of conditions and the following disclaimer in the 73 | // documentation and/or other materials provided with the distribution. 74 | // 75 | // 3. Neither the name of this module nor the names of its contributors may 76 | // be used to endorse or promote products derived from this software 77 | // without specific prior written permission. 78 | // 79 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 80 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 81 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 82 | // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 83 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 84 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 85 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 86 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 87 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 88 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 89 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 90 | // 91 | /** 92 | * All code is in an anonymous closure to keep the global namespace clean. 93 | * 94 | * @param {number=} overflow 95 | * @param {number=} startdenom 96 | */ 97 | (function (pool, math, width, chunks, significance, overflow, startdenom) { 98 | 99 | 100 | // 101 | // seedrandom() 102 | // This is the seedrandom function described above. 103 | // 104 | math['seedrandom'] = function seedrandom(seed, use_entropy) { 105 | var key = []; 106 | var arc4; 107 | 108 | // Flatten the seed string or build one from local entropy if needed. 109 | seed = mixkey(flatten( 110 | use_entropy ? [seed, pool] : 111 | arguments.length ? seed : 112 | [new Date().getTime(), pool, window], 3), key); 113 | 114 | // Use the seed to initialize an ARC4 generator. 115 | arc4 = new ARC4(key); 116 | 117 | // Mix the randomness into accumulated entropy. 118 | mixkey(arc4.S, pool); 119 | 120 | // Override Math.random 121 | 122 | // This function returns a random double in [0, 1) that contains 123 | // randomness in every bit of the mantissa of the IEEE 754 value. 124 | 125 | math['random'] = function random() { // Closure to return a random double: 126 | var n = arc4.g(chunks); // Start with a numerator n < 2 ^ 48 127 | var d = startdenom; // and denominator d = 2 ^ 48. 128 | var x = 0; // and no 'extra last byte'. 
129 | while (n < significance) { // Fill up all significant digits by 130 | n = (n + x) * width; // shifting numerator and 131 | d *= width; // denominator and generating a 132 | x = arc4.g(1); // new least-significant-byte. 133 | } 134 | while (n >= overflow) { // To avoid rounding up, before adding 135 | n /= 2; // last byte, shift everything 136 | d /= 2; // right using integer math until 137 | x >>>= 1; // we have exactly the desired bits. 138 | } 139 | return (n + x) / d; // Form the number within [0, 1). 140 | }; 141 | 142 | // Return the seed that was used 143 | return seed; 144 | }; 145 | 146 | // 147 | // ARC4 148 | // 149 | // An ARC4 implementation. The constructor takes a key in the form of 150 | // an array of at most (width) integers that should be 0 <= x < (width). 151 | // 152 | // The g(count) method returns a pseudorandom integer that concatenates 153 | // the next (count) outputs from ARC4. Its return value is a number x 154 | // that is in the range 0 <= x < (width ^ count). 155 | // 156 | /** @constructor */ 157 | function ARC4(key) { 158 | var t, u, me = this, keylen = key.length; 159 | var i = 0, j = me.i = me.j = me.m = 0; 160 | me.S = []; 161 | me.c = []; 162 | 163 | // The empty key [] is treated as [0]. 164 | if (!keylen) { key = [keylen++]; } 165 | 166 | // Set up S using the standard key scheduling algorithm. 167 | while (i < width) { me.S[i] = i++; } 168 | for (i = 0; i < width; i++) { 169 | t = me.S[i]; 170 | j = lowbits(j + t + key[i % keylen]); 171 | u = me.S[j]; 172 | me.S[i] = u; 173 | me.S[j] = t; 174 | } 175 | 176 | // The "g" method returns the next (count) outputs as one number. 
  // Returns the next (count) ARC4 output bytes folded into one number
  // in the range 0 <= x < width^count.
  me.g = function getnext(count) {
    var s = me.S;
    // standard ARC4 state step: advance i and j, then swap S[i] and S[j]
    var i = lowbits(me.i + 1); var t = s[i];
    var j = lowbits(me.j + t); var u = s[j];
    s[i] = u;
    s[j] = t;
    var r = s[lowbits(t + u)];
    while (--count) {
      i = lowbits(i + 1); t = s[i];
      j = lowbits(j + t); u = s[j];
      s[i] = u;
      s[j] = t;
      // shift previous output left by one byte and append the new one
      r = r * width + s[lowbits(t + u)];
    }
    me.i = i;
    me.j = j;
    return r;
  };
  // For robust unpredictability discard an initial batch of values.
  // See http://www.rsa.com/rsalabs/node.asp?id=2009
  me.g(width);
}

//
// flatten()
// Converts an object tree to nested arrays of strings.
//
/** @param {Object=} result
  * @param {string=} prop */
function flatten(obj, depth, result, prop) {
  result = [];
  if (depth && typeof(obj) == 'object') {
    for (prop in obj) {
      if (prop.indexOf('S') < 5) { // Avoid FF3 bug (local/sessionStorage)
        try { result.push(flatten(obj[prop], depth - 1)); } catch (e) {}
      }
    }
  }
  // leaves (or exhausted depth) are stringified
  return result.length ? result : '' + obj;
}

//
// mixkey()
// Mixes a string seed into a key that is an array of integers, and
// returns a shortened string seed that is equivalent to the result key.
//
/** @param {number=} smear
  * @param {number=} j */
function mixkey(seed, key, smear, j) {
  seed += '';                              // Ensure the seed is a string
  smear = 0;
  for (j = 0; j < seed.length; j++) {
    // accumulate each character into the key, smearing across positions
    key[lowbits(j)] =
      lowbits((smear ^= key[lowbits(j)] * 19) + seed.charCodeAt(j));
  }
  // rebuild an equivalent short string seed from the mixed key
  seed = '';
  for (j in key) { seed += String.fromCharCode(key[j]); }
  return seed;
}

//
// lowbits()
// A quick "n mod width" for width a power of 2.
//
function lowbits(n) { return n & (width - 1); }

//
// The following constants are related to IEEE 754 limits.
245 | // 246 | startdenom = math.pow(width, chunks); 247 | significance = math.pow(2, significance); 248 | overflow = significance * 2; 249 | 250 | // 251 | // When seedrandom.js is loaded, we immediately mix a few bits 252 | // from the built-in RNG into the entropy pool. Because we do 253 | // not want to intefere with determinstic PRNG state later, 254 | // seedrandom will not call math.random on its own again after 255 | // initialization. 256 | // 257 | mixkey(math.random(), pool); 258 | 259 | // End anonymous scope, and pass initial values. 260 | })( 261 | [], // pool: entropy pool starts empty 262 | Math, // math: package containing random, pow, and seedrandom 263 | 256, // width: each RC4 output is 0 <= x < 256 264 | 6, // chunks: at least six RC4 outputs for each double 265 | 52 // significance: there are 52 significant digits in a double 266 | ); 267 | -------------------------------------------------------------------------------- /web/public/js/sigma.min.js: -------------------------------------------------------------------------------- 1 | /* sigmajs.org - an open-source light-weight JavaScript graph drawing library - Version: 0.1 - Author: Alexis Jacomy - License: MIT */ 2 | var sigma={tools:{},classes:{},instances:{}}; 3 | (function(){if(!Array.prototype.some)Array.prototype.some=function(i,m){var f=this.length;if("function"!=typeof i)throw new TypeError;for(var j=0;j";a+="

LOCAL :

";for(b in d.p.localProbes)a+="

"+b+" : "+d.p.localProbes[b]()+"

";d.p.dom.innerHTML=a;return d}sigma.classes.Cascade.call(this);var d=this;this.instance=b;this.monitoring=!1;this.p={fps:40,dom:l,globalProbes:{"Time (ms)":sigma.chronos.getExecutionTime,Queue:sigma.chronos.getQueuedTasksCount,Tasks:sigma.chronos.getTasksCount,FPS:sigma.chronos.getFPS},localProbes:{"Nodes count":function(){return d.instance.graph.nodes.length},"Edges count":function(){return d.instance.graph.edges.length}}}; 22 | this.activate=function(){if(!d.monitoring)d.monitoring=window.setInterval(g,1E3/d.p.fps);return d};this.desactivate=function(){if(d.monitoring)window.clearInterval(d.monitoring),d.monitoring=null,d.p.dom.innerHTML="";return d}}function j(b){function l(b){if(a.p.mouseEnabled&&(g(a.mouseX,a.mouseY,a.ratio*(0<(void 0!=b.wheelDelta&&b.wheelDelta||void 0!=b.detail&&-b.detail)?a.p.zoomMultiply:1/a.p.zoomMultiply)),a.p.blockScroll))b.preventDefault?b.preventDefault():b.returnValue=!1}function g(b,c,g){if(!a.isMouseDown&& 23 | (window.clearInterval(a.interpolationID),w=void 0!=g,i=a.stageX,j=b,o=a.stageY,k=c,h=g||a.ratio,h=Math.min(Math.max(h,a.p.minRatio),a.p.maxRatio),m=a.p.directZooming?1-(w?a.p.zoomDelta:a.p.dragDelta):0,a.ratio!=h||a.stageX!=j||a.stageY!=k))d(),a.interpolationID=window.setInterval(d,50),a.dispatch("startinterpolate")}function d(){m+=w?a.p.zoomDelta:a.p.dragDelta;m=Math.min(m,1);var b=sigma.easing.quadratic.easeout(m),c=a.ratio;a.ratio=c*(1-b)+h*b;w?(a.stageX=j+(a.stageX-j)*a.ratio/c,a.stageY=k+(a.stageY- 24 | k)*a.ratio/c):(a.stageX=i*(1-b)+j*b,a.stageY=o*(1-b)+k*b);a.dispatch("interpolate");if(1<=m)window.clearInterval(a.interpolationID),b=a.ratio,w?(a.ratio=h,a.stageX=j+(a.stageX-j)*a.ratio/b,a.stageY=k+(a.stageY-k)*a.ratio/b):(a.stageX=j,a.stageY=k),a.dispatch("stopinterpolate")}sigma.classes.Cascade.call(this);sigma.classes.EventDispatcher.call(this);var a=this;this.p={minRatio:1,maxRatio:32,marginRatio:1,zoomDelta:0.1,dragDelta:0.3,zoomMultiply:2,directZooming:!1,blockScroll:!0,inertia:1.1,mouseEnabled:!0}; 25 | var 
f=0,c=0,i=0,o=0,h=1,j=0,k=0,s=0,p=0,C=0,n=0,m=0,w=!1;this.stageY=this.stageX=0;this.ratio=1;this.mouseY=this.mouseX=0;this.isMouseDown=!1;b.addEventListener("DOMMouseScroll",l,!0);b.addEventListener("mousewheel",l,!0);b.addEventListener("mousemove",function(b){a.mouseX=void 0!=b.offsetX&&b.offsetX||void 0!=b.layerX&&b.layerX||void 0!=b.clientX&&b.clientX;a.mouseY=void 0!=b.offsetY&&b.offsetY||void 0!=b.layerY&&b.layerY||void 0!=b.clientY&&b.clientY;if(a.isMouseDown){var d=a.mouseX-f+i,h=a.mouseY- 26 | c+o;if(d!=a.stageX||h!=a.stageY)p=s,n=C,s=d,C=h,a.stageX=d,a.stageY=h,a.dispatch("drag")}a.dispatch("move");b.preventDefault?b.preventDefault():b.returnValue=!1},!0);b.addEventListener("mousedown",function(b){if(a.p.mouseEnabled)a.isMouseDown=!0,a.dispatch("mousedown"),i=a.stageX,o=a.stageY,f=a.mouseX,c=a.mouseY,p=s=a.stageX,n=C=a.stageY,a.dispatch("startdrag"),b.preventDefault?b.preventDefault():b.returnValue=!1},!0);document.addEventListener("mouseup",function(b){if(a.p.mouseEnabled&&a.isMouseDown)a.isMouseDown= 27 | !1,a.dispatch("mouseup"),(i!=a.stageX||o!=a.stageY)&&g(a.stageX+a.p.inertia*(a.stageX-p),a.stageY+a.p.inertia*(a.stageY-n)),b.preventDefault?b.preventDefault():b.returnValue=!1},!0);this.checkBorders=function(){return a};this.interpolate=g}function n(b,l,g,d,a,f,c){function i(a){var b=d,c="fixed"==h.p.labelSize?h.p.defaultLabelSize:h.p.labelSizeRatio*a.displaySize;b.font=(h.p.hoverFontStyle||h.p.fontStyle||"")+" "+c+"px "+(h.p.hoverFont||h.p.font||"");b.fillStyle="node"==h.p.labelHoverBGColor?a.color|| 28 | 
h.p.defaultNodeColor:h.p.defaultHoverLabelBGColor;b.beginPath();if(h.p.labelHoverShadow)b.shadowOffsetX=0,b.shadowOffsetY=0,b.shadowBlur=4,b.shadowColor=h.p.labelHoverShadowColor;sigma.tools.drawRoundRect(b,Math.round(a.displayX-c/2-2),Math.round(a.displayY-c/2-2),Math.round(b.measureText(a.label).width+1.5*a.displaySize+c/2+4),Math.round(c+4),Math.round(c/2+2),"left");b.closePath();b.fill();b.shadowOffsetX=0;b.shadowOffsetY=0;b.shadowBlur=0;b.beginPath();b.fillStyle="node"==h.p.nodeBorderColor?a.color|| 29 | h.p.defaultNodeColor:h.p.defaultNodeBorderColor;b.arc(Math.round(a.displayX),Math.round(a.displayY),a.displaySize+h.p.borderSize,0,2*Math.PI,!0);b.closePath();b.fill();b.beginPath();b.fillStyle="node"==h.p.nodeHoverColor?a.color||h.p.defaultNodeColor:h.p.defaultNodeHoverColor;b.arc(Math.round(a.displayX),Math.round(a.displayY),a.displaySize,0,2*Math.PI,!0);b.closePath();b.fill();b.fillStyle="node"==h.p.labelHoverColor?a.color||h.p.defaultNodeColor:h.p.defaultLabelHoverColor;b.fillText(a.label,Math.round(a.displayX+ 30 | 1.5*a.displaySize),Math.round(a.displayY+c/2-3));return h}function o(a){if(isNaN(a.x)||isNaN(a.y))throw Error("A node's coordinate is not a number (id: "+a.id+")");return!a.hidden&&a.displayX+a.displaySize>-j/3&&a.displayX-a.displaySize<4*j/3&&a.displayY+a.displaySize>-k/3&&a.displayY-a.displaySize<4*k/3}sigma.classes.Cascade.call(this);var h=this;this.p={labelColor:"default",defaultLabelColor:"#000",labelHoverBGColor:"default",defaultHoverLabelBGColor:"#fff",labelHoverShadow:!0,labelHoverShadowColor:"#000", 31 | 
labelHoverColor:"default",defaultLabelHoverColor:"#000",labelActiveBGColor:"default",defaultActiveLabelBGColor:"#fff",labelActiveShadow:!0,labelActiveShadowColor:"#000",labelActiveColor:"default",defaultLabelActiveColor:"#000",labelSize:"fixed",defaultLabelSize:12,labelSizeRatio:2,labelThreshold:6,font:"Arial",hoverFont:"",activeFont:"",fontStyle:"",hoverFontStyle:"",activeFontStyle:"",edgeColor:"source",defaultEdgeColor:"#aaa",defaultEdgeType:"line",defaultNodeColor:"#aaa",nodeHoverColor:"node", 32 | defaultNodeHoverColor:"#fff",nodeActiveColor:"node",defaultNodeActiveColor:"#fff",borderSize:0,nodeBorderColor:"node",defaultNodeBorderColor:"#fff",edgesSpeed:200,nodesSpeed:200,labelsSpeed:200};var j=f,k=c;this.currentLabelIndex=this.currentNodeIndex=this.currentEdgeIndex=0;this.task_drawLabel=function(){for(var b=a.nodes.length,c=0;c++=h.p.labelThreshold|| 33 | d.forceLabel){var f="fixed"==h.p.labelSize?h.p.defaultLabelSize:h.p.labelSizeRatio*d.displaySize;l.font=h.p.fontStyle+f+"px "+h.p.font;l.fillStyle="node"==h.p.labelColor?d.color||h.p.defaultNodeColor:h.p.defaultLabelColor;l.fillText(d.label,Math.round(d.displayX+1.5*d.displaySize),Math.round(d.displayY+f/2-3))}}else h.currentLabelIndex++;return h.currentLabelIndex(b*=2)?0.5*b*b:-0.5*(--b*(b-2)-1)};sigma.tools.drawRoundRect=function(b,f,g,d,a,i,c){var i=i?i:0,j=c?c:[],j="string"==typeof j?j.split(" "):j,c=i&&(0<=j.indexOf("topleft")||0<=j.indexOf("top")||0<=j.indexOf("left")),m=i&&(0<=j.indexOf("topright")||0<=j.indexOf("top")||0<=j.indexOf("right")), 61 | 
h=i&&(0<=j.indexOf("bottomleft")||0<=j.indexOf("bottom")||0<=j.indexOf("left")),j=i&&(0<=j.indexOf("bottomright")||0<=j.indexOf("bottom")||0<=j.indexOf("right"));b.moveTo(f,g+i);c?b.arcTo(f,g,f+i,g,i):b.lineTo(f,g);m?(b.lineTo(f+d-i,g),b.arcTo(f+d,g,f+d,g+i,i)):b.lineTo(f+d,g);j?(b.lineTo(f+d,g+a-i),b.arcTo(f+d,g+a,f+d-i,g+a,i)):b.lineTo(f+d,g+a);h?(b.lineTo(f+i,g+a),b.arcTo(f,g+a,f,g+a-i,i)):b.lineTo(f,g+a);b.lineTo(f,g+i)};sigma.tools.getRGB=function(b,f){var b=b.toString(),g={r:0,g:0,b:0};if(3<= 62 | b.length&&"#"==b.charAt(0)){var d=b.length-1;6==d?g={r:parseInt(b.charAt(1)+b.charAt(2),16),g:parseInt(b.charAt(3)+b.charAt(4),16),b:parseInt(b.charAt(5)+b.charAt(5),16)}:3==d&&(g={r:parseInt(b.charAt(1)+b.charAt(1),16),g:parseInt(b.charAt(2)+b.charAt(2),16),b:parseInt(b.charAt(3)+b.charAt(3),16)})}f&&(g=[g.r,g.g,g.b]);return g};sigma.tools.rgbToHex=function(b,f,g){return sigma.tools.toHex(b)+sigma.tools.toHex(f)+sigma.tools.toHex(g)};sigma.tools.toHex=function(b){b=parseInt(b,10);if(isNaN(b))return"00"; 63 | b=Math.max(0,Math.min(b,255));return"0123456789ABCDEF".charAt((b-b%16)/16)+"0123456789ABCDEF".charAt(b%16)};sigma.publicPrototype=r.prototype})(); 64 | -------------------------------------------------------------------------------- /web/server.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | ''' 4 | Created on Aug 7, 2011 5 | 6 | @author: alexpak 7 | ''' 8 | 9 | import cherrypy 10 | import sys 11 | import time 12 | import cgi 13 | import os 14 | 15 | path = os.path.dirname(os.path.abspath(__file__)) + '/' 16 | 17 | sys.path.append(path + '../src') 18 | 19 | f = open(path + 'html/tagging.html', 'rb') 20 | content = f.read().decode() 21 | f.close() 22 | 23 | import re 24 | import template 25 | import socket 26 | import time 27 | 28 | def recvall(sock): 29 | output = '' 30 | while True: 31 | data = sock.recv(4096) 32 | if not data: 33 | break 34 | output += 
data.decode('utf-8') 35 | return output 36 | 37 | import morph 38 | import re 39 | 40 | Tagger = morph.Tagger() 41 | 42 | def get_color(pos): 43 | if pos[0] == 'S': 44 | return 'blue' 45 | elif pos[0] == 'V': 46 | return 'green' 47 | elif pos[0] == 'A': 48 | return 'orange' 49 | else: 50 | return 'gray' 51 | 52 | categories = { 53 | 'S': 'сущ.', 54 | 'A': 'прил.', 55 | 'V': 'глагол', 56 | 'VINF': 'инф.', 57 | 'VADJ': 'прич.', 58 | 'VADV': 'дееп.', 59 | 'ADV': 'нар.', 60 | 'NID': 'инoстр.', 61 | 'NUM': 'числ.', 62 | 'PR': 'предлог', 63 | 'PART': 'част.', 64 | 'CONJ': 'союз', 65 | 'COM': 'ком.', 66 | 'INTJ': 'межд.', 67 | 'P': 'P', 68 | 'UNK': '???', 69 | 'm': 'муж. род', 70 | 'f': 'жен. род', 71 | 'n': 'ср. род', 72 | 'sg': 'ед. число', 73 | 'pl': 'мн. число', 74 | 'nom': 'им. падеж', 75 | 'gen': 'род. падеж', 76 | 'dat': 'дат. падеж', 77 | 'acc': 'вин. падеж', 78 | 'ins': 'твор. падеж', 79 | 'prep': 'пред. падеж', 80 | 'gen2': '2й род. падеж', 81 | 'loc': 'мест. падеж', 82 | 'anim': 'одуш.', 83 | 'inan': 'неодуш.', 84 | '1p': '1е лицо', 85 | '2p': '2е лицо', 86 | '3p': '3е лицо', 87 | 'perf': 'соверш.', 88 | 'imperf': 'несоверш.', 89 | 'real': 'действ.', 90 | 'imp': 'повелит.', 91 | 'pass': 'страд.', 92 | 'pst': 'прош. время', 93 | 'npst': 'непрош. время', 94 | 'prs': 'наст. время', 95 | 'comp': 'сравн. степень', 96 | 'supl': 'превосх. степень', 97 | 'shrt': 'кратк.' 
98 | } 99 | def pos_to_human(pos): 100 | loc = [] 101 | for feat in pos.split('.'): 102 | if feat in categories: 103 | loc.append(categories[feat]) 104 | else: 105 | loc.append(feat) 106 | 107 | return loc 108 | 109 | class HelloWorld: 110 | @cherrypy.expose 111 | def index(self, text = ''): 112 | 113 | start = time.time() 114 | text = text.strip() 115 | T = template.Template() 116 | T.text = cgi.escape(text) 117 | error = '' 118 | 119 | sentence = [[w] for w in re.split('\W+', text) if len(w)] if len(text) else [] 120 | 121 | if 0 < len(sentence) < 25: 122 | 123 | labeled = Tagger.label(sentence) 124 | for w in range(0, len(sentence)): 125 | sentence[w] = (sentence[w][0], labeled[w][1], labeled[w][2]) 126 | 127 | selected_feat = {'m', 'f', 'n', 'sg', 'pl', '1p', '2p', '3p', 'nom', 'gen', 'gen2', 'dat', 'acc', 'ins', 'prep', 'loc', 'real', 'imp', 'pass', 'comp', 'shrt'} 128 | 129 | parser_input = [] 130 | for word in sentence: 131 | w = word[0] or 'FANTOM' 132 | p = '.'.join([word[1]] + sorted(word[2] & selected_feat)) 133 | parser_input.append('{0}\t{1}\n'.format(w, p)) 134 | 135 | client_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 136 | client_socket.connect(("localhost", 5000)) 137 | for word in parser_input: 138 | client_socket.send(bytes(word, 'utf-8')) 139 | 140 | client_socket.send(bytes('\n', 'utf-8')) 141 | data = recvall(client_socket).strip() 142 | client_socket.close() 143 | 144 | time_total = time.time() - start 145 | words = len(sentence) 146 | words_per_sec = words / time_total 147 | 148 | edges = [] 149 | nodes = [(0, 'ROOT', 'red')] 150 | 151 | tagged = [tuple(row.split('\t')) for row in data.split('\n')] 152 | n = 0 153 | for word in tagged: 154 | n += 1 155 | if len(word) < 4: 156 | continue 157 | 158 | nodes.append((n, word[0], get_color(word[1]))) 159 | 160 | n = 0 161 | for word in tagged: 162 | n += 1 163 | if len(word) < 4: 164 | continue 165 | head = int(word[2]) 166 | if len(tagged) < head: 167 | head = 0 168 | 169 | try: 
170 | edges.append((n, head, word[3])) 171 | finally: 172 | pass 173 | 174 | print(tagged, file=sys.stderr) 175 | T.tagged = [(word[0], ', '.join(pos_to_human(word[1]))) for word in tagged] 176 | T.edges = edges 177 | T.nodes = nodes 178 | T.time_total = round(time_total, 2) 179 | T.words_per_sec = round(words_per_sec) 180 | T.words = words 181 | elif len(sentence) > 25: 182 | error = 'Sentence is too long, looks like "War and Peace"' 183 | 184 | T.error = error 185 | 186 | return T.transform(content) 187 | 188 | @cherrypy.expose 189 | def test(self): 190 | return content 191 | 192 | cherrypy.server.socket_host = '0.0.0.0' 193 | config = { 194 | '/': { 195 | 'tools.staticdir.on': True, 196 | 'tools.staticdir.dir': path + 'public/', 197 | 'tools.encode.encoding': 'utf8' 198 | } 199 | } 200 | 201 | cherrypy.quickstart(HelloWorld(), config = config) 202 | --------------------------------------------------------------------------------