├── images └── chrome_Deck_contents_–_jpdb_-_httpsjpdb.io_-_Google_C_2022-03-09_16-24-16.png ├── jpdb-freq-list.user.js └── readme.md /images/chrome_Deck_contents_–_jpdb_-_httpsjpdb.io_-_Google_C_2022-03-09_16-24-16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MarvNC/jpdb-freq-list/6389ae1f011c2d4f99771d34733039ca4454dbf5/images/chrome_Deck_contents_–_jpdb_-_httpsjpdb.io_-_Google_C_2022-03-09_16-24-16.png -------------------------------------------------------------------------------- /jpdb-freq-list.user.js: -------------------------------------------------------------------------------- 1 | // ==UserScript== 2 | // @name JPDB Deck to frequency 3 | // @namespace https://github.com/MarvNC 4 | // @match https://jpdb.io/deck* 5 | // @match https://jpdb.io/*/vocabulary-list* 6 | // @version 1.3.1 7 | // @require https://cdnjs.cloudflare.com/ajax/libs/FileSaver.js/2.0.5/FileSaver.min.js 8 | // @require https://cdnjs.cloudflare.com/ajax/libs/jszip/3.7.1/jszip.min.js 9 | // @author Marv 10 | // @icon https://avatars.githubusercontent.com/u/17340496 11 | // @description Exports a JPDB deck to a Yomichan compatible frequency list. 
// ==/UserScript==

/**
 * Base delay, in milliseconds, observed after every page request.
 * `getUrl` raises it by 20% each time a request fails, so scraping slows
 * down for the rest of the run once failures start.
 */
let delayMs = 1200;

/** Suffix shown on a frequency that belongs to the hiragana spelling of a term. */
const kanaSymbol = '㋕';
/** Suffix shown on a term that does not appear in the JPDB corpus at all. */
const unusedSymbol = '❌';
const hiraganaRegex = /^[\u3040-\u309F]+$/;

/** @param {string} str @returns {boolean} true when `str` is non-empty and purely hiragana. */
const isHiragana = (str) => hiraganaRegex.test(str);

/** @param {string} deckname @returns {string} name of the exported zip (includes an ISO timestamp). */
const fileName = (deckname) => `[Freq] ${deckname}_${new Date().toISOString()}.zip`;

/**
 * Builds the URL of one page of the vocabulary list.
 * @param {string} domain - deck URL up to and including the deck id / `vocabulary-list`.
 * @param {string} paramSymbol - `?` or `&`, depending on whether `domain` already has a query.
 * @param {string} sort - value for `sort_by`.
 * @param {number} offset - index of the first entry on the requested page.
 * @returns {string}
 */
const buildUrl = (domain, paramSymbol, sort, offset) =>
  `${domain}${paramSymbol}sort_by=${sort}&offset=${offset}`;

const defaultSort = 'by-frequency-global';

// NOTE(review): the markup that was originally inside this template literal is
// missing from the source under review. This is a minimal reconstruction that
// satisfies the two things the script relies on: the root element receives the
// click listener and it contains a <summary> used as the status line.
// Confirm against the published script before releasing.
const buttonHTML = /* html */ `
<details>
  <summary>Export as frequency list</summary>
</details>`;

/**
 * Builds the `index.json` of the exported dictionary.
 * https://github.com/FooSoft/yomichan/blob/master/ext/data/schemas/dictionary-index-schema.json
 * @param {string} name - dictionary title (the deck name).
 * @param {string} sort - sort order the deck was scraped in; recorded in the revision.
 * @returns {object}
 */
const jsonIndex = (name, sort) => ({
  title: name,
  format: 3,
  revision: `JPDB_${sort}_${new Date().toISOString()}`,
  frequencyMode: 'rank-based',
  author: 'jpdb, Marv',
  url: 'https://jpdb.io',
  description: `Generated via userscript: https://github.com/MarvNC/jpdb-freq-list
${kanaSymbol} is used to indicate a frequency for a hiragana reading.
${unusedSymbol} is used to indicate that a term does not appear in the JPDB corpus.`,
});

const entriesPerPage = 50;

/**
 * Splits the link of one vocabulary entry into the pieces the exporter needs.
 * The link is expected to look like `https://jpdb.io/vocabulary/<id>/<spelling>#a`.
 * @param {string} href - absolute URL of the entry link.
 * @returns {{entryID: string, kanji: string, usedInUrl: string}}
 */
function parseTermHref(href) {
  return {
    entryID: href.split('/')[4],
    kanji: decodeURIComponent(href).split('/')[5].replace('#a', ''),
    usedInUrl: href.replace('#a', '/used-in'),
  };
}

/**
 * Estimates the remaining scraping time by scaling the nominal time
 * (`delayMs` per page) with the ratio between measured and nominal elapsed time.
 * @param {number} pagesDone - pages already scraped (may be fractional).
 * @param {number} pagesLeft - pages still to scrape (may be fractional).
 * @param {number} actualMsElapsed - measured time since scraping started.
 * @returns {number} estimated milliseconds remaining; always finite for finite input.
 */
function estimateMsRemaining(pagesDone, pagesLeft, actualMsElapsed) {
  const assumedMsElapsed = pagesDone * delayMs;
  const ratio = actualMsElapsed / assumedMsElapsed;
  // Before the first page the division is x/0: NaN or Infinity. Infinity is
  // truthy, so a plain truthiness test is not enough to keep it out.
  const usableRatio = Number.isFinite(ratio) && ratio > 0 ? ratio : 1;
  return usableRatio * pagesLeft * delayMs;
}

/**
 * Binary-searches for the 1-based rank of the first unused entry.
 * Assumes unused entries form a contiguous tail of the list and that entry
 * `total` itself is unused (the caller checks this first).
 * @param {number} total - number of entries in the list.
 * @param {(rank: number) => (boolean|Promise<boolean>)} isUnused - probe for one rank.
 * @returns {Promise<number>} rank of the first unused entry.
 */
async function findFirstUnused(total, isUnused) {
  let lastUsed = 0;
  let top = total;
  while (top - lastUsed > 1) {
    const mid = Math.floor((top + lastUsed) / 2);
    if (await isUnused(mid)) {
      top = mid;
    } else {
      lastUsed = mid;
    }
  }
  return lastUsed + 1;
}

/**
 * Converts the scraped terms into the rows of a Yomichan term meta bank.
 * https://github.com/FooSoft/yomichan/blob/master/ext/data/schemas/dictionary-term-meta-bank-v3-schema.json
 * @param {Object<string, Object<string, {reading: string, freq: number, isKana: boolean}>>} termEntries
 *   scraped terms, keyed by jpdb entry id and then by spelling.
 * @param {number} firstUnused - rank of the first entry absent from the corpus;
 *   ranks at or beyond it are clamped to it and flagged with `unusedSymbol`.
 * @returns {Array} rows `[term, 'freq', data]`, sorted by ascending frequency value.
 */
function buildFrequencyList(termEntries, firstUnused) {
  const termEntryData = (kanji, reading, freqValue, isKana = false) => {
    const unused = freqValue >= firstUnused;
    const value = Math.min(freqValue, firstUnused);
    const frequency = {
      value,
      displayValue: `${value}${isKana ? kanaSymbol : ''}${unused ? unusedSymbol : ''}`,
    };
    // The third value is just the frequency when the term is its own reading,
    // otherwise an object carrying both the reading and the frequency.
    return [kanji, 'freq', kanji === reading ? frequency : { reading, frequency }];
  };
  const rankOf = (row) => row[2].value ?? row[2].frequency?.value;

  const freqList = [];
  for (const entry of Object.values(termEntries)) {
    for (const [kanji, termData] of Object.entries(entry)) {
      freqList.push(termEntryData(kanji, termData.reading, termData.freq, termData.isKana));

      const readingEntry = entry[termData.reading];
      if (kanji !== termData.reading && readingEntry && readingEntry.freq < firstUnused) {
        // The term is not its own reading and the kana spelling is a used
        // entry too: also expose the kana spelling's frequency on this term.
        freqList.push(termEntryData(kanji, termData.reading, readingEntry.freq, true));
      } else if (!termData.isKana && kanji === termData.reading) {
        // Katakana spelling: borrow the frequency of the hiragana spelling.
        const convertedHiragana = katakanaToHiragana(kanji);
        const hiraganaEntry = entry[convertedHiragana];
        if (convertedHiragana !== kanji && hiraganaEntry && hiraganaEntry.freq < firstUnused) {
          freqList.push(termEntryData(kanji, kanji, hiraganaEntry.freq, true));
        }
      }
    }
  }

  freqList.sort((a, b) => rankOf(a) - rankOf(b));
  return freqList;
}

/**
 * Replaces the content of `target` with `lines`, separated by <br> elements.
 * Text nodes are used so page-derived text (deck name) is never parsed as HTML.
 * @param {Element} target
 * @param {string[]} lines
 */
function setStatus(target, lines) {
  const nodes = [];
  lines.forEach((line, index) => {
    if (index > 0) nodes.push(document.createElement('br'));
    nodes.push(document.createTextNode(line));
  });
  target.replaceChildren(...nodes);
}

/**
 * Scrapes every page of the deck, finds the first unused entry and downloads
 * the resulting frequency dictionary as a zip file.
 * @param {object} options
 * @param {string} options.domain - base deck URL (see `buildUrl`).
 * @param {string} options.paramSymbol - `?` or `&`.
 * @param {string} options.sortOrder - value for `sort_by`.
 * @param {string} options.deckName - title used for the dictionary and file name.
 * @param {number} options.entriesAmount - number of entries the page reports.
 * @param {Element} options.statusElem - element that receives progress text.
 * @throws {Error} when nothing could be scraped or a page keeps failing to load.
 */
async function exportDeck({ domain, paramSymbol, sortOrder, deckName, entriesAmount, statusElem }) {
  // get terms
  const termEntries = {};
  const usedInURLsList = [];
  let currentFreq = 1;
  const totalPages = Math.ceil(entriesAmount / entriesPerPage);
  const startTime = performance.now();

  for (let offset = 0; offset < entriesAmount; offset += entriesPerPage) {
    const predictedMsRemaining = estimateMsRemaining(
      offset / entriesPerPage,
      (entriesAmount - offset) / entriesPerPage,
      performance.now() - startTime
    );

    setStatus(statusElem, [
      `${deckName}: ${entriesAmount} entries`,
      `Sort: ${sortOrder}`,
      `Scraping page ${Math.floor(offset / entriesPerPage) + 1} of ${totalPages}.`,
      `${currentFreq - 1} entries scraped.`,
      `${formatMs(predictedMsRemaining)} remaining.`,
      'Estimated to complete at',
      new Date(Date.now() + predictedMsRemaining).toTimeString().substring(0, 8),
    ]);

    const doc = await getUrl(buildUrl(domain, paramSymbol, sortOrder, offset));
    const links = [...doc.querySelectorAll('.vocabulary-list .entry .vocabulary-spelling a')];

    for (const link of links) {
      const { entryID, kanji, usedInUrl } = parseTermHref(link.href);
      usedInURLsList.push(usedInUrl);

      // A spelling without any furigana (<rt>) that is pure hiragana is a kana entry.
      const isKana = !link.querySelector('rt') && isHiragana(kanji);
      const reading = [...link.querySelectorAll('ruby')]
        .map((ruby) =>
          ruby.childElementCount > 0 ? ruby.firstElementChild.innerText : ruby.innerText
        )
        .join('');

      termEntries[entryID] ??= {};
      termEntries[entryID][kanji] = { reading, freq: currentFreq, isKana };
      currentFreq++;
    }
  }

  // Probe with the number of entries actually scraped: the count advertised by
  // the page can differ, and indexing past the list would request `undefined`.
  const scrapedCount = usedInURLsList.length;
  if (scrapedCount === 0) {
    throw new Error('No entries could be scraped from the deck.');
  }

  // check if unused, get first unused
  const isUnused = async (entryNumber) => {
    const doc = await getUrl(usedInURLsList[entryNumber - 1]);
    return [...doc.querySelectorAll('p')].some((elem) =>
      elem.innerText.includes('No matching entries were found.')
    );
  };

  setStatus(statusElem, ['Checking for unused entries.']);
  let firstUnused;
  // premade vocab decks can't have unused entries
  if (document.URL.includes('vocabulary-list') || !(await isUnused(scrapedCount))) {
    console.log('No unused entries.');
    firstUnused = scrapedCount + 1;
  } else {
    firstUnused = await findFirstUnused(scrapedCount, (mid) => {
      setStatus(statusElem, [`Checking for unused: ${mid}`]);
      console.log(mid);
      return isUnused(mid);
    });
  }
  console.log(`First unused: ${firstUnused}`);

  setStatus(statusElem, [
    `Finished scraping ${currentFreq - 1} entries, generating zip file.`,
    `First unused entry: ${firstUnused}`,
  ]);

  const freqList = buildFrequencyList(termEntries, firstUnused);
  const exportFileName = fileName(deckName);

  setStatus(statusElem, [
    `Exporting as ${exportFileName}`,
    `Total entries: ${freqList.length}`,
    `Sorted by ${sortOrder}`,
    `First unused entry: ${firstUnused}`,
  ]);

  console.log(`Scraped ${freqList.length} entries`);

  // JSZip and saveAs come from the @require'd libraries in the userscript header.
  const zip = new JSZip();
  zip.file('index.json', JSON.stringify(jsonIndex(deckName, sortOrder)));
  zip.file('term_meta_bank_1.json', JSON.stringify(freqList));

  const content = await zip.generateAsync({
    type: 'blob',
    compression: 'DEFLATE',
    compressionOptions: {
      level: 9,
    },
  });
  saveAs(content, exportFileName);
}

/**
 * Entry point: reads deck information from the current page and inserts the
 * export button above the "Showing … entries" paragraph. Does nothing when the
 * page does not look like a deck or vocabulary list.
 */
async function main() {
  // Optional chaining: on a non-matching URL `match` returns null, and indexing
  // it directly would throw before the guard below could run.
  const domain = document.URL.match(/.+jpdb\.io\/.+(id=(\d+|\w+)|vocabulary-list)/)?.[0];
  if (!domain) return;
  const paramSymbol = domain.includes('vocabulary-list') ? '?' : '&';

  const sortOrder = document.URL.match(/sort_by=([\w\-]+)/)?.[1] ?? defaultSort;

  const browseDeckElem = [...document.querySelectorAll('div')].find(
    (elem) => elem.innerText === 'Browse deck'
  );
  const deckName =
    browseDeckElem?.nextElementSibling?.innerText ??
    document.querySelector('h4')?.innerText ??
    'jpdb deck';

  const entriesAmountTextElem = [...document.querySelectorAll('p')].find(
    (elem) => elem.innerText.startsWith('Showing') && elem.innerText.endsWith('entries')
  );
  const entriesAmount = Number.parseInt(
    entriesAmountTextElem?.innerText.match(/from (\d+) entries/)?.[1],
    10
  );
  if (!entriesAmountTextElem || Number.isNaN(entriesAmount)) {
    console.warn('jpdb-freq-list: could not find the entry count on this page.');
    return;
  }

  console.log(`${deckName}
${entriesAmount} entries
Sort order: ${sortOrder}`);

  const button = createElementFromHTML(buttonHTML);
  const buttonText = button.querySelector('summary');
  entriesAmountTextElem.parentNode.insertBefore(button, entriesAmountTextElem);

  let exporting = false;

  button.addEventListener('click', async () => {
    if (exporting) return;
    exporting = true;

    // prevent accidental closing tab
    const warnBeforeUnload = (e) => {
      e.preventDefault();
      e.returnValue = 'Are you sure you want to stop exporting?';
    };
    window.addEventListener('beforeunload', warnBeforeUnload);

    try {
      await exportDeck({
        domain,
        paramSymbol,
        sortOrder,
        deckName,
        entriesAmount,
        statusElem: buttonText,
      });
      // `exporting` deliberately stays true after a successful export so that
      // clicking the status text does not start a second scrape.
    } catch (err) {
      console.error(err);
      setStatus(buttonText, [`Export failed: ${err.message}`, 'Click to retry.']);
      exporting = false;
    } finally {
      // Stop nagging once the export is over, whether it succeeded or not.
      window.removeEventListener('beforeunload', warnBeforeUnload);
    }
  });
}

// Only start when a page is available, and never leave the promise floating.
if (typeof document !== 'undefined') {
  main().catch((err) => console.error('jpdb-freq-list failed to start', err));
}

/**
 * Converts katakana (U+30A1–U+30F6) to the matching hiragana; every other
 * character, including the long-vowel mark, is left unchanged.
 * @param {string} str
 * @returns {string}
 */
function katakanaToHiragana(str) {
  return str.replace(/[\u30A1-\u30F6]/g, (match) =>
    String.fromCharCode(match.charCodeAt(0) - 0x60)
  );
}

/**
 * Parses an HTML snippet and returns its first node.
 * @param {string} htmlString - markup with a single root element.
 * @returns {ChildNode|null} the root node, or null for an empty string.
 */
function createElementFromHTML(htmlString) {
  const div = document.createElement('div');
  div.innerHTML = htmlString.trim();
  return div.firstChild;
}

/**
 * Fetches `url` and parses the response body as an HTML document.
 * Every request is followed by a pause. After a failure (HTTP error status or
 * network error) the pause for this URL doubles and the global `delayMs`
 * grows by 20%.
 * @param {string} url
 * @param {number} [maxRetries=10] - retries after the first attempt before giving up.
 * @returns {Promise<Document>}
 * @throws {Error} when the page still cannot be loaded after `maxRetries` retries.
 */
async function getUrl(url, maxRetries = 10) {
  let waitMs = delayMs;
  let lastError;
  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    if (attempt > 0) {
      waitMs *= 2;
      delayMs = Math.round(delayMs * 1.2);
      console.log('Failed response, new wait:' + waitMs);
    }

    let response;
    try {
      response = await fetch(url);
    } catch (err) {
      // A rejected fetch (e.g. connection dropped) is retried like an HTTP error.
      lastError = err;
    }
    await timer(waitMs);

    if (response?.ok) {
      const parser = new DOMParser();
      return parser.parseFromString(await response.text(), 'text/html');
    }
    if (response) {
      lastError = new Error(`HTTP ${response.status}`);
    }
  }
  throw new Error(`Could not load ${url} after ${maxRetries + 1} attempts`, { cause: lastError });
}

/** @param {number} ms @returns {Promise<void>} resolves after `ms` milliseconds. */
function timer(ms) {
  return new Promise((res) => setTimeout(res, ms));
}

/**
 * Formats a duration in milliseconds as HH:MM:SS.
 * Hours are not wrapped at 24; a negative or non-finite duration yields `--:--:--`.
 * @param {number} ms
 * @returns {string}
 */
function formatMs(ms) {
  if (!Number.isFinite(ms) || ms < 0) return '--:--:--';
  const totalSeconds = Math.floor(ms / 1000);
  const pad = (n) => String(n).padStart(2, '0');
  const hours = Math.floor(totalSeconds / 3600);
  const minutes = Math.floor((totalSeconds % 3600) / 60);
  return `${pad(hours)}:${pad(minutes)}:${pad(totalSeconds % 60)}`;
}

// --------------------------------------------------------------------------------
// /readme.md:
// --------------------------------------------------------------------------------
// ### Check out these other [Yomichan dictionaries](https://github.com/MarvNC/yomichan-dictionaries/)
//
// # JPDB Frequency List
//
//
// > [!WARNING]
> Kuuuube has released a [more recent and better version of the JPDB frequency list](https://github.com/Kuuuube/yomitan-dictionaries?tab=readme-ov-file#jpdb-v21-frequency) that I recommend using instead.

### [Download](https://github.com/MarvNC/jpdb-freq-list/releases)

A frequency list generated using most of the [jpdb](https://jpdb.io/) corpus can be found in the [releases](https://github.com/MarvNC/jpdb-freq-list/releases). It is not exhaustive, as there is no default deck available for the entire corpus. However, it covers about 96% of the top 20,000 entries on JPDB and has over 47 万 (470,000) entries.

- Frequencies for hiragana versions of kanji dictionary entries will be marked by `㋕`. For example, if you hover over 成る, you will see frequencies for both なる and 成る.
- Frequencies for terms that do not appear at all in jpdb's corpus will be marked with `❌`.

### Note

Due to the nature of how this list was generated, it can no longer be updated, as jpdb now limits the total number of entries that can be in a single deck.

If you want to merge multiple frequency lists, you could try adding the media decks to your account and then using [this site](https://kampffrosch94.github.io/jpdb-deck-manager/) to merge them before creating a frequency list.

# JPDB Deck to Yomichan Frequency List Userscript

### [Install](https://github.com/MarvNC/jpdb-freq-list/raw/master/jpdb-freq-list.user.js)

This userscript generates frequency lists compatible with [Yomichan](https://foosoft.net/projects/yomichan/) using [jpdb](https://jpdb.io).

It is developed and tested on [Violentmonkey](https://violentmonkey.github.io/), which is the recommended way to run the script.

## Usage

![example image](./images/chrome_Deck_contents_–_jpdb_-_httpsjpdb.io_-_Google_C_2022-03-09_16-24-16.png)

Simply navigate to a jpdb deck's vocabulary page and click the button that says "Export as frequency list". The script uses the frequency sort setting currently applied to the deck page; if none is applied, it defaults to sorting by frequency across the whole corpus.

--------------------------------------------------------------------------------