├── .gitignore ├── README.md ├── chrome ├── background.js ├── content.js ├── content_script.js ├── loader.js ├── manifest.json ├── options.html ├── options.js ├── owl_128.png ├── owl_16.png ├── owl_48.png ├── popup.html ├── popup.js ├── quick_search.js └── shared.js ├── conf └── config.json ├── docs ├── examples.md ├── icon.png ├── index.md ├── Тарзан.png ├── зачем.png ├── крупнейший.png ├── лиса.png ├── причем.png ├── продано.png ├── проще.png ├── свет.png ├── сумела.png ├── творог.png └── форматы.png └── scripts ├── Pipfile ├── build-indexes.py ├── download-pages.py ├── download-resources.py ├── package-extension.sh └── parse-pages.py /.gitignore: -------------------------------------------------------------------------------- 1 | generated/ 2 | build/ 3 | apicache*/ 4 | throttle.ctrl 5 | Pipfile.lock 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | This Chrome extension adds an accent to mark the stress on words in Russian. Hovering over a words brings up a popover with its definition(s) from wiktionary. 4 | 5 | The extension contains an index of all Russian words from the English wiktionary site with all their grammatical forms and accent position. 6 | 7 | For more information and to install, visit the [extension page](https://chrome.google.com/webstore/detail/slava-russian-dictionary/bcbcmhmpbggnljoapclfcagammaapghi). 8 | 9 | # How to build 10 | 11 | ```bash 12 | 13 | pip3 install pipenv 14 | cd scripts 15 | pipenv install 16 | 17 | pipenv run python3 download-resources.py 18 | pipenv run python3 download-pages.py 19 | pipenv run python3 parse-pages.py 20 | pipenv run python3 build-indexes.py 21 | ./package-extension.sh 22 | 23 | ``` 24 | 25 | # License 26 | 27 | This work is licensed under the Creative Commons Attribution-ShareAlike 3.0 Unported License. 
To view a copy of this license, visit http://creativecommons.org/licenses/by-sa/3.0/. 28 | 29 | Icon originally by [karthikeyan](https://openclipart.org/detail/owl-by-karthikeyan), via [Wikimedia Commons](https://commons.wikimedia.org/wiki/File:Clipart_owl.png). 30 | -------------------------------------------------------------------------------- /chrome/background.js: -------------------------------------------------------------------------------- 1 | 2 | console.log("loading dictionary data"); 3 | var forms_q = $.getJSON(chrome.extension.getURL('generated/resources/ru/forms.json')); 4 | var lemmas_q = $.getJSON(chrome.extension.getURL('generated/resources/ru/words.json')); 5 | var active_tabs = {} 6 | 7 | function load(unload) { 8 | apply_to_tab(function (tab) { 9 | if (active_tabs[tab.id]) { 10 | chrome.tabs.executeScript(null, { file: "generated/underscore.js" }); 11 | chrome.tabs.executeScript(null, { file: "generated/jquery.js" }); 12 | chrome.tabs.executeScript(null, { file: "generated/bootstrap.js" }); 13 | chrome.tabs.executeScript(null, { file: "generated/slavaConfig.js" }); 14 | chrome.tabs.executeScript(null, { file: "shared.js" }); 15 | chrome.tabs.executeScript(null, { file: "content_script.js" }); 16 | chrome.tabs.insertCSS(null, { file: "generated/bootstrap.css" }); 17 | } 18 | else if (unload) { 19 | chrome.tabs.executeScript(null, { code: "location.reload()" }); 20 | } 21 | }); 22 | } 23 | 24 | function apply_to_tab(f) { 25 | chrome.tabs.query({ 26 | "currentWindow": true, 27 | "active": true //Add any parameters you want 28 | }, function (tabs) {//It returns an array 29 | $.each(tabs, function (i, tab) { 30 | f(tab); 31 | }); 32 | }); 33 | } 34 | 35 | 36 | $.when(forms_q, lemmas_q).done(function (forms_r, lemmas_r) { 37 | console.log("loaded dictionary data"); 38 | var forms = forms_r[0]; 39 | var lemmas = lemmas_r[0]; 40 | 41 | chrome.runtime.onMessage.addListener( 42 | function (request, sender, sendResponse) { 43 | if (request.type == 
"resolve") { 44 | var retval = {}; 45 | $.each(request.payload, function (entry_i, entry) { 46 | var forms_for_entry = forms[entry]; 47 | if (forms_for_entry) { 48 | var return_entries = Array(); 49 | $.each(forms_for_entry, function (form_i, form) { 50 | return_entries.push([lemmas[form[0]], form[1], form[2], 0]); 51 | 52 | // e.g. шедшая -> шедший -> идти 53 | // про́ще -> простой -> простоя́ть 54 | // расчлененные -> расчленённый -> расчленить 55 | var forms_root = forms[normalize(lemmas[form[0]][0])]; 56 | if (forms_root) { 57 | $.each(forms_root, function (root_i, root) { 58 | return_entries.push([lemmas[root[0]], [], [], 1]); 59 | }); 60 | } 61 | 62 | 63 | 64 | }); 65 | retval[entry] = return_entries; 66 | } 67 | }); 68 | sendResponse({ payload: { forms: retval } }); 69 | } 70 | else if (request.type == "get-enabled") { 71 | apply_to_tab(function (tab) { 72 | sendResponse(active_tabs[tab.id]) 73 | }); 74 | return true; // mark message response as async 75 | } 76 | else if (request.type == "set-enabled") { 77 | apply_to_tab(function (tab) { 78 | active_tabs[tab.id] = request.payload 79 | }); 80 | load(true); 81 | 82 | } 83 | else if (request.type == "load") { 84 | load(false); 85 | } 86 | else if (request.type == "set-language_pref") { 87 | chrome.storage.sync.set({ 'language_pref': request.payload }); 88 | } 89 | else if (request.type == "get-language_pref") { 90 | chrome.storage.sync.get('language_pref', function (response) { 91 | language_pref = response.language_pref || []; 92 | for (var key in slavaConfig.wiktionary) { 93 | if (!language_pref.includes(key)) { 94 | language_pref.push(key); 95 | } 96 | } 97 | sendResponse(language_pref); 98 | }); 99 | return true; // mark message response as async 100 | } 101 | }); 102 | 103 | }); 104 | -------------------------------------------------------------------------------- /chrome/content.js: -------------------------------------------------------------------------------- 1 | (function () { 2 | 'use strict'; 3 | 4 
| $(document).ready(mark_words) 5 | 6 | })(); //outer function 7 | -------------------------------------------------------------------------------- /chrome/content_script.js: -------------------------------------------------------------------------------- 1 | (function () { 2 | 'use strict'; 3 | 4 | var entityMap = { 5 | '&': '&', 6 | '<': '<', 7 | '>': '>', 8 | '"': '"', 9 | "'": ''', 10 | '/': '/', 11 | '`': '`', 12 | '=': '=', 13 | }; 14 | 15 | // Regexp for matching Russian words 16 | var re = /[А-яЁё\-\u0301]+/g; 17 | 18 | // parse document without loading images. See https://stackoverflow.com/questions/15113910 19 | var virtualDocument = document.implementation.createHTMLDocument('virtual'); 20 | 21 | function escapeHtml(string) { 22 | return String(string).replace(/[&<>"'`=\/]/g, function (s) { 23 | return entityMap[s]; 24 | }); 25 | } 26 | 27 | function getTextNodesIn(node, includeWhitespaceNodes) { 28 | var textNodes = [], nonWhitespaceMatcher = /\S/; 29 | 30 | function getTextNodes(node) { 31 | if (node.nodeType == Node.TEXT_NODE) { 32 | if (includeWhitespaceNodes || nonWhitespaceMatcher.test(node.nodeValue)) { 33 | textNodes.push(node); 34 | } 35 | } else if (!["SCRIPT", "STYLE", "NOSCRIPT"].includes(node.nodeName)) { 36 | for (var i = 0, len = node.childNodes.length; i < len; ++i) { 37 | getTextNodes(node.childNodes[i]); 38 | } 39 | } 40 | } 41 | 42 | getTextNodes(node); 43 | return textNodes; 44 | } 45 | 46 | function parse_table(table) { 47 | var rows = table.children('tbody').children('tr'); 48 | var t = []; 49 | for (var i = 0; i < rows.length; i++) { 50 | var r = []; 51 | var row = rows[i]; 52 | var td = $(row).children('td,th'); 53 | for (var j = 0; j < td.length; j++) { 54 | var c = td.get(j); 55 | // apply colspan 56 | for (var j2 = 0; j2 < c.colSpan; j2++) { 57 | r.push([c, c.rowSpan, $(c).text()]); 58 | } 59 | } 60 | t.push(r); 61 | } 62 | for (var i = 0; i < t.length; i++) { 63 | var r = t[i]; 64 | for (var j = 0; j < r.length; j++) { 65 | 
var c = r[j]; 66 | var c0 = $(c[0]); 67 | // apply rowspan 68 | if (c[1] > 1) { 69 | t[i + 1].splice(j, 0, [c[0], c[1] - 1, c[2]]); 70 | } 71 | // remove span e.g. animate / inanimate in Владимир 72 | if (c[0].tagName == 'TH') { 73 | c0.children('span[style]').remove(); 74 | } 75 | } 76 | } 77 | return t; 78 | } 79 | 80 | function add_grammar(grammar, element) { 81 | var text = $(element).text().trim(); 82 | if (!grammar.includes(text)) { 83 | grammar.push(text); 84 | } 85 | } 86 | 87 | function grammar_from_table(table, element, cases) { 88 | var t = parse_table(table); 89 | 90 | for (var i = 0; i < t.length; i++) { 91 | for (var j = 0; j < t[i].length; j++) { 92 | var c = t[i][j]; 93 | //multiple elements can match because of colspan 94 | if (c[0] != element) { continue; } 95 | 96 | var grammar_tokens = []; 97 | var in_th = false; 98 | for (var i2 = i - 1; i2 >= 0; i2--) { 99 | if (t[i2][j][0].tagName == 'TH') { 100 | add_grammar(grammar_tokens, t[i2][j][0]); 101 | in_th = true; 102 | } 103 | else if (in_th) { 104 | break; 105 | } 106 | } 107 | in_th = false; 108 | for (var j2 = j - 1; j2 >= 0; j2--) { 109 | if (t[i][j2][0].tagName == 'TH') { 110 | add_grammar(grammar_tokens, t[i][j2][0]); 111 | in_th = true; 112 | } 113 | else if (in_th) { 114 | break; 115 | } 116 | } 117 | if (grammar_tokens) { 118 | var grammarText = grammar_tokens.reverse().join(" "); 119 | if (!cases.includes(grammarText)) { 120 | cases.push(grammarText); 121 | } 122 | } 123 | } 124 | } 125 | } 126 | 127 | 128 | function genCharArray(charA, charZ) { 129 | var a = [], i = charA.charCodeAt(0), j = charZ.charCodeAt(0); 130 | for (; i <= j; ++i) { 131 | a.push(String.fromCharCode(i)); 132 | } 133 | return a.join(""); 134 | } 135 | 136 | function xpath_list(jquery_elements, expr) { 137 | 138 | var nodes = []; 139 | $.each(jquery_elements.get(), function (i, e) { 140 | var iterator = document.evaluate(expr, e, null, XPathResult.UNORDERED_NODE_ITERATOR_TYPE, null); 141 | var thisNode = 
iterator.iterateNext(); 142 | while (thisNode) { 143 | nodes.push(thisNode); 144 | thisNode = iterator.iterateNext(); 145 | } 146 | }); 147 | return nodes; 148 | } 149 | 150 | function parse_wiki(dom, word, lemma, freq, src_lang, lang_pair) { 151 | var page_url = 'https://' + src_lang + '.wiktionary.org/wiki/' + lemma; 152 | var lang_span_id = lang_pair.lang_span_id; // FIXME may be _1 153 | var lang_conf = slavaConfig.wiktionary[src_lang]; 154 | var language_heading = lang_conf.language_heading; 155 | var langspan = dom.find(language_heading + " > span#" + lang_span_id + ".mw-headline"); 156 | var langsection = langspan.parent().nextUntil(language_heading); 157 | 158 | var wordClasses = _.object(_.map(lang_conf.definition_headings, function (v) { return [v, 1]; })); 159 | //Words may have multiple classes, e.g. под 160 | //Will be h3, or h4 if multiple etymologies, e.g. погрузиться 161 | var wordClassHeadings = langsection.find("span.mw-headline").filter(function () { return wordClasses[$(this).text().trim()]; }); 162 | 163 | // Add word class within definition (since word class heading is removed) 164 | // Add frequency within definition 165 | if (freq) { 166 | var freq_span = ' ' + freq + ''; 167 | wordClassHeadings.each(function () { 168 | var s = $('' + $(this).text() + ''); 169 | if (lang_conf.heading_is_class) { 170 | $(this).parent().next().children(':first-child').after(s).after(' '); 171 | s.after(freq_span); 172 | } 173 | else { 174 | $(this).parent().next().prepend(freq_span); 175 | } 176 | }); 177 | } 178 | 179 | 180 | var defn = wordClassHeadings.parent().nextUntil('hr,h1,h2,h3,h4,h5'); // e.g. 
with hr: после 181 | var full_def = wordClassHeadings.parent().nextUntil(wordClassHeadings.prop('tagName')); 182 | 183 | 184 | 185 | var upper = genCharArray('A', 'Z') + genCharArray('А', 'Я') + 'Ë'; 186 | var lower = upper.toLowerCase(); 187 | var expr1 = '//td/span[@lang="ru"]'; 188 | var expr2s = ['', '/a']; // свое́й under свой is once not full content of the cell 189 | var expr3 = '[translate(.,"' + upper + UNICODE_COMBINING_ACUTE_ACCENT + '", "' + lower + '")=translate("' + escapeHtml(word) + '","' + upper + UNICODE_COMBINING_ACUTE_ACCENT + '", "' + lower + '")]/ancestor::td[1]'; 190 | 191 | var cases = []; 192 | $.each(expr2s, function (i, expr2) { 193 | var nodes = xpath_list(full_def, expr1 + expr2 + expr3); 194 | 195 | $.each(nodes, function (j, element) { 196 | grammar_from_table($(element).closest('table'), element, cases); 197 | }); 198 | }); 199 | 200 | var comparatives = xpath_list(full_def, "b[@lang='ru' and preceding-sibling::*[1][name()='i' and text()='comparative']]"); 201 | $.each(comparatives, function (i, element) { 202 | var comparative = element.textContent; 203 | var prefix = "по"; 204 | var test; 205 | if (comparative.startsWith("(" + prefix + ")")) { 206 | comparative = comparative.slice(prefix.length + 2); 207 | test = [comparative, prefix + comparative]; 208 | } 209 | else { 210 | test = [comparative]; 211 | } 212 | for (var i = 0; i < test.length; i++) { 213 | if (test[i] == word) { 214 | cases.push("comparative"); 215 | break; 216 | } 217 | } 218 | }); 219 | 220 | defn = defn.filter(":not(table.flextable)"); 221 | 222 | // Remove transliterations 223 | 224 | defn.find("a[title='Wiktionary:Russian transliteration']").remove(); 225 | 226 | var translit = defn.find("span.tr, i.tr"); //e.g. 
with : свет 227 | 228 | // Remove parentheses / dashes before / after transliteration 229 | $.each(translit.get(), function (i, e) { 230 | var prev = e.previousSibling; 231 | var next = e.nextSibling; 232 | if (prev && next) { 233 | if (prev.textContent.slice(-1) == "(" && next.textContent.slice(0, 1) == ")") { 234 | prev.textContent = prev.textContent.slice(0, -1); 235 | next.textContent = next.textContent.slice(1); 236 | } 237 | if (prev.textContent.trim() == "―") { //e.g. свет 238 | prev.textContent = ""; 239 | } 240 | if (next.textContent.slice(0, 2) == ", ") { //e.g. погрузиться 241 | next.textContent = next.textContent.slice(2); 242 | } 243 | } 244 | }); 245 | 246 | 247 | translit.remove(); 248 | 249 | 250 | // Add hyperlink to original wiktionary page 251 | // NB e.g. не#Prefix has no headword 252 | var page_link = document.createElement('a'); 253 | page_link.href = page_url; 254 | var headword = defn.find("strong.headword"); 255 | if (headword.length) { 256 | headword.wrap(page_link); 257 | } else { 258 | page_link.innerText = lemma; 259 | defn.prepend('').prepend(page_link); 260 | } 261 | 262 | 263 | // Change relative hyperlinks to absolute 264 | var page_base = page_link.protocol + "//" + page_link.host; // e.g. 
"https://en.wiktionary.org" 265 | defn.find('a:not([href*="://"],[href^="mailto:"])').each(function () { 266 | $(this).attr('href', function (index, value) { 267 | if (!value) { 268 | return value; 269 | } 270 | if (value.slice(0, 1) == "#") { 271 | return null; 272 | } 273 | if (value.slice(0, 1) == "/") { 274 | return page_base + value; 275 | } 276 | return page_base + page_link.path + value; 277 | }); 278 | 279 | return defn; 280 | }); 281 | 282 | // Remove images 283 | defn.find('img').remove(); 284 | 285 | // Add cases 286 | var casesdiv = $(""); 287 | $.each(cases, function (i, e) { 288 | var casediv = $(""); 289 | casediv.append(e); 290 | casesdiv.append(casediv); 291 | }); 292 | 293 | // Build output structure 294 | var res = $(""); 295 | res.append(defn); 296 | res.append(casesdiv); 297 | return res; 298 | 299 | } 300 | 301 | 302 | function generate_popup(target, lemmas, langs) { 303 | document.body.style.cursor = "progress"; 304 | get_entries(target, lemmas, langs, function (target, items) { 305 | if (!target.attr("data-popover_on")) { 306 | return; 307 | } 308 | var odom = $(''); 309 | $.each(items, function () { 310 | odom.append($(this)); 311 | }); 312 | document.body.style.cursor = "auto"; 313 | 314 | var placement = 'bottom'; 315 | if ((target.offset().top - $(window).scrollTop()) / window.innerHeight > .5) 316 | placement = 'top'; 317 | 318 | target.popover({ 319 | trigger: 'manual', 320 | content: odom, 321 | container: 'body', 322 | placement: placement, 323 | html: true 324 | }); 325 | target.popover("show"); 326 | }); 327 | } 328 | 329 | function get_entries(target, lemmas, langs, callback) { 330 | var src_lang = langs[0]; 331 | var word = target.text(); 332 | var target_lang = 'ru'; 333 | var lang_pair = slavaConfig.langpairs[src_lang][target_lang]; 334 | var ajax_queries = $.map(_.keys(lemmas), function (lemma) { 335 | var url = 'https://' + src_lang + '.wiktionary.org/w/api.php?action=parse&format=json&page=' + lemma + '&prop=text&origin=*'; 
336 | return $.getJSON(url); 337 | }); 338 | 339 | $.when.apply($, ajax_queries).done(function () { 340 | var odom = Array(); 341 | 342 | var res = arguments; 343 | if (ajax_queries.length < 2) { 344 | res = [arguments]; 345 | } 346 | $.each(res, function (i, a1) { 347 | var parsed = a1[0].parse; 348 | if (parsed) { 349 | var html = parsed.text['*']; 350 | var dom = $(html, virtualDocument); 351 | var freq = lemmas[parsed.title]; 352 | dom = parse_wiki(dom, word, parsed.title, freq, src_lang, lang_pair); 353 | if (dom.children().children().length) { 354 | odom.push(dom); 355 | } 356 | 357 | } 358 | }); 359 | if (odom.length || langs.length <= 1) { 360 | callback(target, odom); 361 | } 362 | else { 363 | get_entries(target, lemmas, langs.slice(1), callback); 364 | } 365 | }); 366 | 367 | } 368 | 369 | function gen_popover(item) { 370 | var lemmas = JSON.parse(item.attr("data-lemmas")); 371 | if (lemmas) { 372 | chrome.runtime.sendMessage({ type: "get-language_pref" }, function (response) { 373 | 374 | if (response) { 375 | generate_popup(item, lemmas, response); 376 | } 377 | else { 378 | console.log("No response to get-language_pref"); 379 | } 380 | }); 381 | } 382 | } 383 | 384 | function slava_mouseenter(event) { 385 | $(".popover").css("display", "none"); 386 | event.target.setAttribute("data-popover_on", "1"); 387 | setTimeout(function () { 388 | if (!event.target.getAttribute("data-popover_on")) { 389 | return; 390 | } 391 | gen_popover($(event.target)); 392 | 393 | }, 100); 394 | 395 | } 396 | 397 | function slava_mouseleave(event) { 398 | event.target.removeAttribute("data-popover_on"); 399 | setTimeout(function () { $(event.target).popover("hide"); }, 10000); 400 | } 401 | 402 | function mark_word(word_item, callback) { 403 | var t1 = word_item.text(); 404 | 405 | var allWords = Array(); 406 | var match; 407 | while (match = re.exec(t1)) { 408 | allWords.push(normalize(match[0])); 409 | } 410 | 411 | chrome.runtime.sendMessage({ type: "resolve", payload: 
_.unique(allWords) }, function (response) { 412 | var forms = response.payload.forms; 413 | 414 | var str = t1.replace(re, function (match, group) { 415 | var normalized_word = normalize(match); 416 | var ref = match; 417 | var lemmasf = {}; 418 | if (forms[normalized_word]) { 419 | var entry0 = forms[normalized_word]; 420 | var stress_chars = Array(); 421 | var spellings = {}; 422 | $.each(entry0, function (i, entry) { 423 | var lemma_entry = entry[0]; 424 | stress_chars = stress_chars.concat(entry[1]); 425 | lemmasf[lemma_entry[0]] = lemma_entry[1]; 426 | var is_derived = entry[3]; 427 | if (!is_derived) { 428 | var spelling = entry[2].length ? entry[2][0] : normalized_word; 429 | spellings[spelling] = 1; 430 | } 431 | }); 432 | var matchn = match.replace(UNICODE_COMBINING_ACUTE_ACCENT, ''); 433 | if (!spellings || spellings[matchn]) { ref = matchn } 434 | else { 435 | ref = _.keys(spellings)[0]; 436 | 437 | // match capitalization 438 | if (match[0].toLowerCase() != match[0]) { 439 | if (match.length > 1 && match[1].toLowerCase() != match[1]) { 440 | ref = ref.toUpperCase(); 441 | } 442 | else { 443 | ref = ref.charAt(0).toUpperCase() + ref.slice(1); 444 | } 445 | } 446 | } 447 | 448 | // mark stress with accent character 449 | if (stress_chars) { 450 | var stress_pos = _.uniq(stress_chars).sort(); 451 | var chars = ref; 452 | var accented = ""; 453 | var s_pos = 0; 454 | $.each(stress_pos, function (i, stress_char) { 455 | accented += chars.slice(s_pos, stress_char) + UNICODE_COMBINING_ACUTE_ACCENT; 456 | s_pos = stress_char; 457 | }); 458 | accented += chars.slice(s_pos); 459 | ref = accented; 460 | } 461 | } 462 | else { 463 | if (match.length > 3 && match[0] === match[0].toLowerCase()) { 464 | console.log("[Slava] No match:" + match); 465 | } 466 | lemmasf[match] = 0; 467 | } 468 | var slemmas = JSON.stringify(lemmasf); 469 | return "" + '' + ref + ''; 470 | }); // replace 471 | str = "" + str + ""; 472 | var span = $(str); 473 | word_item.replaceWith(span); 
474 | if (callback) 475 | callback(span); 476 | }); 477 | } 478 | 479 | function mark_words() { 480 | 481 | 482 | $('head').append(''); 483 | 484 | var v = getTextNodesIn(document.body); 485 | 486 | $.each(v, function () { 487 | mark_word($(this), null); 488 | }); 489 | 490 | 491 | } 492 | 493 | 494 | $(document).ready(mark_words) 495 | 496 | $("body").on("mouseenter", ".slava-pop", slava_mouseenter); 497 | $("body").on("mouseleave", ".slava-pop", slava_mouseleave); 498 | 499 | $("body").append(' '); 500 | 501 | $('#slava-try').on("input", function () { 502 | $('#slava-try-res').text($(this).val()); 503 | $.each($('#slava-try-res').contents(), function () { 504 | mark_word($(this), function (obj) { 505 | var target = obj.filter('.slava-pop'); 506 | target.attr("data-popover_on", "1"); 507 | gen_popover(target); 508 | } 509 | ); 510 | }); 511 | }); 512 | 513 | })(); //outer function 514 | -------------------------------------------------------------------------------- /chrome/loader.js: -------------------------------------------------------------------------------- 1 | function load_slava() { 2 | chrome.runtime.sendMessage({ type: "load" }); 3 | } 4 | load_slava(); 5 | -------------------------------------------------------------------------------- /chrome/manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "manifest_version": 2, 3 | "name": "Slava Russian Dictionary", 4 | "description": "This extension writes accents on Russian words, and adds a popup to each word with its English definition.", 5 | "version": "0.0.0.4", 6 | "browser_action": { 7 | "default_title": "Enrich text with accents and dictionary popups", 8 | "default_icon": "owl_16.png", 9 | "default_popup": "popup.html" 10 | }, 11 | "options_ui": { 12 | "page": "options.html", 13 | "chrome_style": true 14 | }, 15 | "icons": { 16 | "16": "owl_16.png", 17 | "48": "owl_48.png", 18 | "128": "owl_128.png" 19 | }, 20 | "permissions": [ 21 | "storage", 22 | 
"activeTab", 23 | "" 24 | ], 25 | "background": { 26 | "scripts": [ 27 | "generated/jquery.js", 28 | "generated/slavaConfig.js", 29 | "shared.js", 30 | "background.js" 31 | ], 32 | "persistent": false 33 | }, 34 | "content_scripts": [ 35 | { 36 | "matches": [""], 37 | "js": ["loader.js"] 38 | } 39 | ], 40 | "web_accessible_resources": [ 41 | "generated/resources/*" 42 | ] 43 | } 44 | -------------------------------------------------------------------------------- /chrome/options.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 19 | 20 | 21 | 22 | 23 | 24 | Slava Translator 25 | Preference order of definition language (drag to change) 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /chrome/options.js: -------------------------------------------------------------------------------- 1 | document.addEventListener('DOMContentLoaded', () => { 2 | 3 | chrome.runtime.sendMessage({ type: "get-language_pref" }, function (lang_pref) { 4 | var langs = $('#slava-langs'); 5 | $.each(lang_pref, function (i, lang) { 6 | text = slavaConfig.wiktionary[lang].name; 7 | $('').appendTo(langs).text(text).attr("data-lang", lang); 8 | }); 9 | 10 | var sortable = Sortable.create(langs.get(0), { 11 | animation: 150, 12 | onSort: function (evt) { 13 | langs = $.map($(evt.to).children('div'), function (el) { 14 | return $(el).attr("data-lang"); 15 | }); 16 | chrome.runtime.sendMessage({ type: "set-language_pref", payload: langs }); 17 | } 18 | }); 19 | 20 | }); 21 | 22 | }) 23 | -------------------------------------------------------------------------------- /chrome/owl_128.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/algattik/SlavaTranslator/61eb2eed5c6cc3ee29805662606f63ffd3672b46/chrome/owl_128.png -------------------------------------------------------------------------------- 
/chrome/owl_16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/algattik/SlavaTranslator/61eb2eed5c6cc3ee29805662606f63ffd3672b46/chrome/owl_16.png -------------------------------------------------------------------------------- /chrome/owl_48.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/algattik/SlavaTranslator/61eb2eed5c6cc3ee29805662606f63ffd3672b46/chrome/owl_48.png -------------------------------------------------------------------------------- /chrome/popup.html: -------------------------------------------------------------------------------- 1 | 2 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 30 | 31 | 32 | 33 | 34 | Slava Translator 35 | 36 | 37 | Enable for this tab 39 | 40 | Page reloaded with Slava disabled. 41 | 42 | 43 | 44 | Options 45 | Search word 46 | 47 | 48 | -------------------------------------------------------------------------------- /chrome/popup.js: -------------------------------------------------------------------------------- 1 | document.addEventListener('DOMContentLoaded', () => { 2 | chrome.runtime.sendMessage({ type: "get-enabled" }, function (response) { 3 | if (response) { 4 | $('#slava-enable').bootstrapToggle('on'); 5 | $('#go-to-search').css('visibility', 'visible'); 6 | } 7 | $('#slava-enable').change(function () { 8 | var checked = $(this).prop('checked'); 9 | chrome.runtime.sendMessage({ type: "set-enabled", payload: checked }); 10 | $('#slava-disable-reload').css('visibility', checked ? 'hidden' : 'visible'); 11 | $('#go-to-search').css('visibility', (!checked) ? 
'hidden' : 'visible'); 12 | 13 | }); 14 | }); 15 | 16 | $('#go-to-options').click(function () { 17 | chrome.runtime.openOptionsPage(); 18 | }); 19 | 20 | $('#go-to-search').click(function () { 21 | chrome.tabs.executeScript(null, { file: "quick_search.js" }); 22 | }); 23 | }) 24 | -------------------------------------------------------------------------------- /chrome/quick_search.js: -------------------------------------------------------------------------------- 1 | document.getElementById("slava-quick-input").style.visibility = "visible"; 2 | -------------------------------------------------------------------------------- /chrome/shared.js: -------------------------------------------------------------------------------- 1 | 2 | // Unicode COMBINING ACUTE ACCENT character, used to mark stress on Russian words 3 | UNICODE_COMBINING_ACUTE_ACCENT = '\u0301'; 4 | 5 | console.log("loading shared"); 6 | function normalize(str) { 7 | str = str.replace(UNICODE_COMBINING_ACUTE_ACCENT, ''); 8 | str = str.toLowerCase(); 9 | str = str.replace('ё', 'е'); 10 | return str; 11 | } 12 | -------------------------------------------------------------------------------- /conf/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "langpairs": { 3 | "en": { 4 | "ru": { 5 | "lang_span_name": "Russian", 6 | "lang_span_id": "Russian", 7 | "include": [ 8 | { 9 | "category": "Russian lemmas", 10 | "recurse": 1 11 | }, 12 | { 13 | "category": "Russian proper nouns" 14 | }, 15 | { 16 | "category": "Russian participles" 17 | }, 18 | { 19 | "category": "Russian adjective superlative forms" 20 | } 21 | ], 22 | "exclude": [ 23 | { 24 | "category": "Russian spellings with е instead of ё" 25 | }, 26 | { 27 | "category": "Russian phrases" 28 | }, 29 | { 30 | "category": "Russian proverbs" 31 | }, 32 | { 33 | "category": "Russian obsolete forms" 34 | } 35 | ] 36 | } 37 | }, 38 | "ru": { 39 | "ru": { 40 | "lang_span_name": "Русский", 41 | 
"lang_span_id": "Русский" 42 | } 43 | }, 44 | "fr": { 45 | "ru": { 46 | "lang_span_name": "Russe", 47 | "lang_span_id": "Russe" 48 | } 49 | } 50 | }, 51 | "languages": { 52 | "ru": { 53 | "frequency_file": "https://github.com/Baksalyar/mc.hertzbeat.ru-Frequency-Dictionaries/raw/master/mc.hertzbeat.ru_frequency_dict.txt" 54 | } 55 | }, 56 | "wiktionary": { 57 | "en": { 58 | "name": "English", 59 | "language_heading": "h2", 60 | "heading_is_class": true, 61 | "definition_headings": [ 62 | "Circumfix", 63 | "Interfix", 64 | "Prefix", 65 | "Affix", 66 | "Suffix", 67 | "Abbreviation", 68 | "Adjective", 69 | "Adverb", 70 | "Conjunction", 71 | "Combining form", 72 | "Diacritical mark", 73 | "Determiner", 74 | "Interjection", 75 | "Idiom", 76 | "Morpheme", 77 | "Letter", 78 | "Noun", 79 | "Numeral", 80 | "Particle", 81 | "Participle", 82 | "Phrase", 83 | "Predicative", 84 | "Preposition", 85 | "Prepositional phrase", 86 | "Pronoun", 87 | "Proper noun", 88 | "Proverb", 89 | "Symbol", 90 | "Verb" 91 | ] 92 | }, 93 | "ru": { 94 | "name": "Russian (русский)", 95 | "language_heading": "h1", 96 | "heading_is_class": false, 97 | "definition_headings": [ 98 | "Значение" 99 | ] 100 | }, 101 | "fr": { 102 | "name": "French (français)", 103 | "language_heading": "h2", 104 | "heading_is_class": true, 105 | "definition_headings": [ 106 | "Circonfixe", 107 | "Interfixe", 108 | "Préfixe", 109 | "Affixe", 110 | "Suffixe", 111 | "Abréviation", 112 | "Adjectif", 113 | "Adverbe", 114 | "Conjonction", 115 | "Déterminant", 116 | "Interjection", 117 | "Idiome", 118 | "Morphème", 119 | "Lettre", 120 | "Nom commun", 121 | "Adjectif numéral", 122 | "Particule", 123 | "Participe", 124 | "Phrase", 125 | "Prédicatif", 126 | "Préposition", 127 | "Pronom", 128 | "Nom propre", 129 | "Proverbe", 130 | "Symbole", 131 | "Verbe" 132 | ] 133 | } 134 | } 135 | } 136 | -------------------------------------------------------------------------------- /docs/examples.md: 
-------------------------------------------------------------------------------- 1 | | Example word | Example | Feature | 2 | | --- | --- | --- | 3 | | зачем |  | Marks accent position in the original word. When cursor is moved over the word, pops up a definition including word base form (here *заче́м*), class (here *Adverb*), frequency rank (here *1624*, meaning it is the 1624th most commonly used word) and a definition. Click on the word base form to navigate to the Wiktionary entry. The frequency rank is useful for language learners, who should focus on learning first the most common 1000 words, then the most common 5000 etc. | 4 | | свет |  | Removes Latin transcription of Russian text from Wiktionary entries. | 5 | | причем, четырехсот |  | Restores *ё* letter in original word when spelled *е*. | 6 | | форматы |  | For nouns, indicates the gender (here *m*=male) and kind (here *inan*=inanimate). In the popup, indicates the declension(s) matched by the original word (here *genitive singular* and *nominative plural*). | 7 | | сумела |  | For verbs, indicates the aspect (here *pf*=perfective) and gives the paired verb (here *уме́ть*). In the popup, indicates the conjugation(s) matched by the original word (here *feminine (я/ты/она́) singular*). | 8 | | продано, расчлененные |  | Recognizes declined forms of verb participles. | 9 | | проще, попроще |  | Recognizes comparative forms. | 10 | | крупнейший, крупнейшего |  | Recognizes superlative forms. | 11 | | творог, свекла |  | Marks all accents where several accent positions are accepted. The reader must stress only one of the positions. | 12 | | лиса |  | Marks all accents where several accent positions are possible depending on grammar. The reader must stress the correct position based on grammar. | 13 | | Тарзан |  | When a word is missing from English Wiktionary, displays entry from Russian Wiktionary instead. The order of language preferences is configurable in the extension options. 
| 14 | -------------------------------------------------------------------------------- /docs/icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/algattik/SlavaTranslator/61eb2eed5c6cc3ee29805662606f63ffd3672b46/docs/icon.png -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | This Chrome extension adds an accent to mark the stress on words in Russian. Hovering over a word brings up a popover with its definition(s) from Wiktionary. 4 | 5 | The extension contains an index of all Russian words from the English Wiktionary site with all their grammatical forms and accent position. 6 | 7 | # How to use 8 | 9 | After the extension is installed, click on this icon in the Chrome toolbar: 10 | 11 | ![icon](icon.png) 12 | 13 | Toggle the switch to enable the extension for the current tab. 14 | 15 | # Details 16 | 17 | See the [examples](examples.md) page for more information. 18 | 19 | 20 | ### Reporting Issues and Feedback 21 | 22 | If you encounter any bugs, please file an issue in the [Issues](https://github.com/algattik/SlavaTranslator/issues) section of our GitHub repo. 23 | 24 | # More information 25 | 26 | Source code and license information can be found at [our GitHub repository](https://github.com/algattik/SlavaTranslator/).
27 | -------------------------------------------------------------------------------- /docs/Тарзан.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/algattik/SlavaTranslator/61eb2eed5c6cc3ee29805662606f63ffd3672b46/docs/Тарзан.png -------------------------------------------------------------------------------- /docs/зачем.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/algattik/SlavaTranslator/61eb2eed5c6cc3ee29805662606f63ffd3672b46/docs/зачем.png -------------------------------------------------------------------------------- /docs/крупнейший.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/algattik/SlavaTranslator/61eb2eed5c6cc3ee29805662606f63ffd3672b46/docs/крупнейший.png -------------------------------------------------------------------------------- /docs/лиса.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/algattik/SlavaTranslator/61eb2eed5c6cc3ee29805662606f63ffd3672b46/docs/лиса.png -------------------------------------------------------------------------------- /docs/причем.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/algattik/SlavaTranslator/61eb2eed5c6cc3ee29805662606f63ffd3672b46/docs/причем.png -------------------------------------------------------------------------------- /docs/продано.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/algattik/SlavaTranslator/61eb2eed5c6cc3ee29805662606f63ffd3672b46/docs/продано.png -------------------------------------------------------------------------------- /docs/проще.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/algattik/SlavaTranslator/61eb2eed5c6cc3ee29805662606f63ffd3672b46/docs/проще.png -------------------------------------------------------------------------------- /docs/свет.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/algattik/SlavaTranslator/61eb2eed5c6cc3ee29805662606f63ffd3672b46/docs/свет.png -------------------------------------------------------------------------------- /docs/сумела.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/algattik/SlavaTranslator/61eb2eed5c6cc3ee29805662606f63ffd3672b46/docs/сумела.png -------------------------------------------------------------------------------- /docs/творог.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/algattik/SlavaTranslator/61eb2eed5c6cc3ee29805662606f63ffd3672b46/docs/творог.png -------------------------------------------------------------------------------- /docs/форматы.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/algattik/SlavaTranslator/61eb2eed5c6cc3ee29805662606f63ffd3672b46/docs/форматы.png -------------------------------------------------------------------------------- /scripts/Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | 3 | url = "https://pypi.python.org/simple" 4 | verify_ssl = true 5 | name = "pypi" 6 | 7 | 8 | [packages] 9 | 10 | pywikibot = "*" 11 | lxml = "*" 12 | progressbar2 = "*" 13 | 14 | 15 | [dev-packages] 16 | 17 | -------------------------------------------------------------------------------- /scripts/build-indexes.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import json 3 | from progressbar import progressbar 4 | from 
collections import defaultdict 5 | 6 | parsed_top_dir = Path("../build/parsed") 7 | index_top_dir = Path("../build/index") 8 | resources_dir = Path("../build/resources") 9 | 10 | config = json.load(open("../conf/config.json")) 11 | 12 | # ranking, adapted from https://stackoverflow.com/a/30801799 13 | 14 | def rank_simple(vector, reverse): 15 | return sorted(range(len(vector)), key=vector.__getitem__, reverse=reverse) 16 | 17 | def rankdata(a, method='average', reverse=False): 18 | n = len(a) 19 | ivec=rank_simple(a, reverse) 20 | svec=[a[rank] for rank in ivec] 21 | sumranks = 0 22 | dupcount = 0 23 | newarray = [0]*n 24 | for i in range(n): 25 | sumranks += i 26 | dupcount += 1 27 | if i==n-1 or svec[i] != svec[i+1]: 28 | for j in range(i-dupcount+1,i+1): 29 | if method=='average': 30 | averank = sumranks / float(dupcount) + 1 31 | newarray[ivec[j]] = averank 32 | elif method=='max': 33 | newarray[ivec[j]] = i+1 34 | elif method=='min': 35 | newarray[ivec[j]] = i+1 -dupcount+1 36 | else: 37 | raise NameError('Unsupported method') 38 | 39 | sumranks = 0 40 | dupcount = 0 41 | 42 | 43 | return newarray 44 | 45 | 46 | for src_lang, targets in config["langpairs"].items(): 47 | for target_lang, langpair in targets.items(): 48 | 49 | print("%s => %s" % (src_lang, target_lang)) 50 | index_dir=Path(index_top_dir, src_lang, target_lang) 51 | parsed_dir=Path(parsed_top_dir, src_lang, target_lang) 52 | index_dir.mkdir(parents=True, exist_ok=True) 53 | words = dict() 54 | forms = defaultdict(lambda : defaultdict(lambda : [set(), set()])) 55 | 56 | freqfile = Path(resources_dir, target_lang + ".freq.txt") 57 | freq2 = defaultdict(lambda : 0) 58 | with open(freqfile) as p: 59 | for line in p: 60 | (form, count) = line.rstrip('\n').split(' ') 61 | c = int(count) 62 | freq2[form] = freq2[form] + c 63 | 64 | print("Listing files...") 65 | files = sorted(parsed_dir.glob('*.dat')) 66 | 67 | print("Parsing files...") 68 | word_counter = 0 69 | for parsed in progressbar(files): 70 | 
with open(parsed) as p: 71 | for line in p: 72 | s = line.rstrip('\n') 73 | (declined, base, stress, canonical) = s.split('\t') 74 | if not base in words: 75 | words[base] = [word_counter, 0] 76 | word_counter = word_counter + 1 77 | b = words[base] 78 | word_i = b[0] 79 | if stress != "0": 80 | forms[declined][word_i][0].add(int(stress)) 81 | forms[declined][word_i][1].add(canonical) 82 | if declined in freq2: 83 | b[1] = b[1] + freq2[declined] 84 | 85 | print("Assembling words...") 86 | words_arr = sorted(list(words.items()), key = lambda e: e[1][0]) 87 | 88 | print("Computing frequency ranks...") 89 | freq_ranks = rankdata([f[1] for f in words.values()], method='min', reverse=True) 90 | words_with_freq = list(zip([w[0] for w in words_arr], freq_ranks)) 91 | 92 | print("Assembling forms...") 93 | for declined, d in progressbar(forms.items()): 94 | words_new = [] 95 | for word_i, entry in d.items(): 96 | (stresses, canonicals) = entry 97 | if len(canonicals) == 1 and next(iter(canonicals)) == declined: 98 | canonicals = [] 99 | words_new.append([word_i, list(stresses), list(canonicals)]) 100 | forms[declined] = words_new 101 | 102 | print("Writing output in [%s]..." 
% index_dir) 103 | with open(Path(index_dir, "words.json"), "w") as f: 104 | json.dump(words_with_freq, f, ensure_ascii=False, separators=(',', ':')) 105 | with open(Path(index_dir, "forms.json"), "w") as f: 106 | json.dump(forms, f, ensure_ascii=False, separators=(',', ':')) 107 | 108 | print("Completed.") 109 | -------------------------------------------------------------------------------- /scripts/download-pages.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import json 3 | import os 4 | import re 5 | from collections import defaultdict 6 | import progressbar 7 | 8 | os.environ['PYWIKIBOT2_NO_USER_CONFIG']='1' 9 | import pywikibot 10 | from pywikibot import pagegenerators 11 | 12 | download_dir = Path("../build/download") 13 | config = json.load(open("../conf/config.json")) 14 | 15 | includes = defaultdict(list) 16 | excludes = defaultdict(list) 17 | for src_lang, targets in config["langpairs"].items(): 18 | for target_lang, langpair in targets.items(): 19 | if "include" in langpair: 20 | includes[src_lang].extend(langpair["include"]) 21 | if "exclude" in langpair: 22 | excludes[src_lang].extend(langpair["exclude"]) 23 | 24 | def toHex(x): 25 | return "".join([hex(ord(c))[2:].zfill(4) for c in x]) 26 | 27 | def download_cat(site, cat, callback): 28 | catName = cat["category"] 29 | recurse = cat["recurse"] if "recurse" in cat else None 30 | category = pywikibot.Category(site, catName) 31 | bar = progressbar.ProgressBar(max_value=category.categoryinfo['pages']) 32 | count = 0 33 | for page in pagegenerators.CategorizedPageGenerator(category, recurse=recurse, namespaces="0"): 34 | count = count + 1 35 | bar.update(count) 36 | callback(page) 37 | bar.finish() 38 | 39 | 40 | for src_lang, incl in includes.items(): 41 | print("Language: %s" % src_lang) 42 | site = pywikibot.Site(code=src_lang, fam='wiktionary') 43 | 44 | download_lang_dir = Path(download_dir, src_lang) 45 | 
download_lang_dir.mkdir(parents=True, exist_ok=True) 46 | 47 | excluded_pages = set() 48 | for excluded_cat in excludes[src_lang]: 49 | print("Excluding pages from category [%s]" % excluded_cat['category']) 50 | download_cat(site, excluded_cat, lambda page: excluded_pages.add(page.title())) 51 | 52 | for included_cat in incl: 53 | 54 | def download_page(page): 55 | title = page.title() 56 | 57 | if title in excluded_pages: 58 | return 59 | if len(title) > 63: 60 | return 61 | 62 | fileName = toHex(title) 63 | my_file = Path(download_lang_dir, fileName + ".json") 64 | if my_file.is_file(): 65 | return 66 | 67 | html = site.get_parsed_page(title) 68 | html = re.sub("", "", html, flags=re.DOTALL) 69 | 70 | my_file.write_text(json.dumps({'title':title, 'text':page.text, 'html':html}, ensure_ascii=False), 'utf-8') 71 | 72 | print("Including pages from category [%s]" % included_cat['category']) 73 | download_cat(site, included_cat, download_page) 74 | 75 | -------------------------------------------------------------------------------- /scripts/download-resources.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import json 3 | import requests 4 | 5 | config = json.load(open("../conf/config.json")) 6 | 7 | download_dir = Path("../build/resources") 8 | download_dir.mkdir(parents=True, exist_ok=True) 9 | 10 | for lang, conf in config["languages"].items(): 11 | r = requests.get(conf["frequency_file"]) 12 | file = Path(download_dir, lang + ".freq.txt") 13 | print(conf["frequency_file"]) 14 | with open(file, 'wb') as f: 15 | f.write(r.content) 16 | -------------------------------------------------------------------------------- /scripts/package-extension.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -euo pipefail 3 | mkdir -p ../chrome/generated/resources/ru 4 | cd ../chrome 5 | cp ../build/index/en/ru/words.json generated/resources/ru 6 | cp 
../build/index/en/ru/forms.json generated/resources/ru 7 | wget -qO generated/underscore.js "https://cdnjs.cloudflare.com/ajax/libs/underscore.js/1.8.3/underscore-min.js" 8 | wget -qO generated/jquery.js "https://ajax.googleapis.com/ajax/libs/jquery/3.2.1/jquery.min.js" 9 | wget -qO generated/bootstrap.js "https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/js/bootstrap.min.js" 10 | wget -qO generated/bootstrap.css "https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap.min.css" 11 | wget -qO generated/bootstrap-toggle.css "https://gitcdn.github.io/bootstrap-toggle/2.2.2/css/bootstrap-toggle.min.css" 12 | wget -qO generated/bootstrap-toggle.js "https://gitcdn.github.io/bootstrap-toggle/2.2.2/js/bootstrap-toggle.min.js" 13 | wget -qO generated/Sortable.js "https://cdnjs.cloudflare.com/ajax/libs/Sortable/1.6.0/Sortable.min.js" 14 | (echo "slavaConfig = "; cat ../conf/config.json) > generated/slavaConfig.js 15 | rm -f ../build/slava-package.zip && zip -r ../build/slava-package.zip . 16 | echo Downloads complete. 
# /scripts/parse-pages.py
"""Parse downloaded Wiktionary pages into per-language-pair form lists.

For every page JSON found under ``../build/download/<src_lang>/`` this script
extracts the word forms tagged with each configured target language
(inflection-table cells, headwords and comparatives) and writes them to
``../build/parsed/<src_lang>/<target_lang>/<page>.dat``. Each output line is a
tab-separated record: normalized form, page title, stress index (``0`` when no
stress was recorded) and the accent-free form.

An empty marker file under ``../build/parsed/<src_lang>/_done/`` records that a
page has been processed so later runs skip it.
"""
from lxml import etree
import unicodedata
from pathlib import Path
from progressbar import progressbar
import json

# The config contains non-ASCII text (e.g. Cyrillic language names), so read it
# as UTF-8 explicitly instead of relying on the platform default encoding.
with open("../conf/config.json", encoding="utf-8") as config_file:
    config = json.load(config_file)

download_dir = Path("../build/download")
parsed_dir = Path("../build/parsed")
# Letters folded together when building the lookup key.
normalize_char_map = {'ё':'е', 'Ё':'Е'}
vowels = 'аэыуояеёюи'


def normalize_string(s):
    """Split a word form into lookup key, stress index and accent-free text.

    The input is NFC-normalized; nonspacing marks (category ``Mn``) and all
    punctuation except ``-`` are then dropped.

    Returns a 3-tuple ``(normalized, stress, plain)``:

    * ``normalized`` -- the stripped text with ``normalize_char_map`` applied,
      lower-cased.
    * ``stress`` -- index, within the NFC-normalized input, of the first
      nonspacing mark, or ``None`` when there is no such mark or when the
      normalized text has at most one vowel.
    * ``plain`` -- the stripped text with original case and ``ё`` preserved.
    """
    norms = unicodedata.normalize('NFC', s)
    noacc = [c
             for c in norms
             if unicodedata.category(c) != 'Mn'  # 'Mark, Nonspacing' = accents
             and (
                 not unicodedata.category(c).startswith('P')  # Punctuation
                 or c == '-'
             )]
    normalized = ''.join(normalize_char_map.get(c, c) for c in noacc).lower()
    stress = None
    # Stress is only recorded for words with more than one vowel.
    if vowel_count(normalized) > 1:
        for p, c in enumerate(norms):
            if unicodedata.category(c) == 'Mn':  # 'Mark, Nonspacing' = accents
                stress = p
                break

    return (normalized, stress, ''.join(noacc))


def add_norm(forms, html, xpath, optional_prefix=None, prefix_norm=None):
    """Add the normalized forms of every node matched by ``xpath`` to ``forms``.

    :param forms: set that receives ``normalize_string`` tuples (mutated).
    :param html: parsed lxml document to query.
    :param xpath: XPath expression selecting the nodes holding word forms.
    :param optional_prefix: literal marker for an optional prefix, e.g.
        ``"(по)"``. A form starting with it is added twice: once with
        ``prefix_norm`` in place of the marker and once without any prefix.
    :param prefix_norm: replacement text for ``optional_prefix``; must be a
        string whenever ``optional_prefix`` is given.
    """
    for match in html.xpath(xpath):
        match_text = match.xpath("string(.)")

        # Cell may contain multiple forms, e.g. свой -> свое́й, свое́ю
        # (https://en.wiktionary.org/wiki/%D1%81%D0%B2%D0%BE%D0%B9)
        for match_form in match_text.split(", "):

            # Add comparative with and without comparative prefix,
            # e.g. попроще and проще
            if optional_prefix and match_form.startswith(optional_prefix):
                suffix = match_form[len(optional_prefix):]
                forms.add(normalize_string(prefix_norm + suffix))
                forms.add(normalize_string(suffix))
            else:
                forms.add(normalize_string(match_form))


def vowel_count(txt):
    """Return how many characters of ``txt`` (case-insensitive) are in ``vowels``."""
    lowered = txt.lower()
    return sum(lowered.count(vowel) for vowel in vowels)


def parse_file(f, src_lang, destdir, marker=None):
    """Extract word forms from one downloaded page and write ``.dat`` files.

    :param f: path of the page JSON (keys ``title`` and ``html`` are read).
    :param src_lang: source-language key into ``config["langpairs"]``.
    :param destdir: directory under which one sub-directory per target
        language is created for the output.
    :param marker: optional path of a "done" marker file. When given, an empty
        file is written there once every language pair has been handled.
        It is an explicit parameter so the function does not depend on a
        module global that only exists while the driver loop is running.
    """
    # Pages are written as UTF-8 by the download step; read them the same way
    # and close the handle deterministically.
    with open(str(f), encoding='utf-8') as page_file:
        pageJson = json.load(page_file)
    of = pageJson['html']
    title = pageJson['title']
    html = etree.fromstring(of)

    for target_lang, langpair in config["langpairs"][src_lang].items():
        lang_name = langpair["lang_span_name"]
        langs = html.xpath("//h2/span[text()='%s' and contains(@class,'mw-headline')]" % lang_name)
        if not langs:  # does not work for Serbo-Croatian
            continue
        forms = set()

        # Elements whose nearest preceding <h2> sibling is this language's heading.
        span_selector = "//*[preceding-sibling::h2[1]/span[text()='%s']]" % lang_name
        td_selector_template = "%s//table[contains(@class,'inflection-table')]/%s/tr/td//span[@lang='%s']"
        # Match rows both under an explicit <tbody> and directly under <table>.
        for tbody_selector in ['tbody', '.']:
            td_selector = td_selector_template % (span_selector, tbody_selector, target_lang)
            add_norm(forms, html, td_selector)
        add_norm(forms, html, "%s//strong[contains(@class,'headword') and @lang='%s']" % (span_selector, target_lang))

        # Parse comparative. NB тёплый has two variants
        comp_select = "//b[@lang='%s' and preceding-sibling::*[name()='i' and text()='comparative']]" % target_lang
        add_norm(forms, html, comp_select, "(по)", "по")

        out_dir = Path(destdir, target_lang)
        out_dir.mkdir(parents=True, exist_ok=True)
        out_file = Path(out_dir, Path(f).with_suffix('.dat').name)
        s = ''.join(["%s\t%s\t%s\t%s\n" % (form[0], title, form[1] if form[1] else 0, form[2]) for form in forms])
        out_file.write_text(s, encoding='utf8')

    if marker is not None:
        marker.write_bytes(b'')


for src_lang, targets in config["langpairs"].items():
    lang_dir = Path(download_dir, src_lang)

    destdir = Path(parsed_dir, src_lang)
    marker_dir = Path(destdir, "_done")
    marker_dir.mkdir(parents=True, exist_ok=True)

    print("Source language: [%s]" % src_lang)
    print("Listing files...")
    files = sorted(lang_dir.glob("*.json"))

    new_pages = 0

    print("Parsing files...")
    for f in progressbar(files):

        # Skip pages already processed by an earlier run.
        marker = Path(marker_dir, Path(f).name)
        if marker.is_file():
            continue

        new_pages = new_pages + 1

        parse_file(f, src_lang, destdir, marker)

    print("Parsed %d new pages out of %d total pages." % (new_pages, len(files)))