├── .editorconfig ├── .github ├── FUNDING.yml ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md └── workflows │ └── ci.yml ├── .gitignore ├── LICENSE ├── MANIFEST.in ├── PyMultiDictionary ├── __init__.py ├── _dictionary.py ├── _goslate.py ├── _tokenizer.py ├── _utils.py └── version.py ├── README.rst ├── build.py ├── codecov.yml ├── requirements.txt ├── setup.py └── test ├── __init__.py ├── data ├── educalingo_en_good.txt ├── mw_en_good.txt ├── synonyms_en_bad.txt ├── synonyms_en_good.txt ├── synonyms_en_not-bad.txt └── thesaurus-for-this-reason.txt ├── test_dictionary.py └── test_utils.py /.editorconfig: -------------------------------------------------------------------------------- 1 | # EditorConfig is awesome: https://EditorConfig.org 2 | 3 | # Top-most EditorConfig file 4 | root = true 5 | 6 | # Unix-style newlines 7 | [*] 8 | charset = utf-8 9 | end_of_line = lf 10 | insert_final_newline = false 11 | 12 | # Configure languages 13 | [*.py] 14 | indent_size = 4 15 | indent_style = space 16 | 17 | [{*.json, *.yml}] 18 | indent_size = 2 19 | indent_style = space -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: ppizarror 4 | patreon: # Patreon 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: ppizarror 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 13 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Environment information** 11 | Describe your environment information, such as: 12 | 13 | - SO: win/linux 14 | - python version: v3.x 15 | - pygame version: v2.x 16 | - pygame-menu version: v3.x.x 17 | 18 | **Describe the bug** 19 | A clear and concise description of what the bug is. 20 | 21 | **To Reproduce** 22 | Please provide a **minimal** reproducible example that developers can run to investigate the problem. 23 | You can find help for creating such an example [here](https://stackoverflow.com/help/minimal-reproducible-example). 24 | 25 | **Expected behavior** 26 | A clear and concise description of what you expected to happen. 27 | 28 | **Additional context** 29 | Add any other context about the problem here. 30 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | concurrency: 4 | cancel-in-progress: true 5 | group: ${{ github.repository }}-${{ github.workflow }}-${{ github.ref }} 6 | 7 | on: 8 | push: 9 | branches: 10 | - master 11 | pull_request: 12 | branches: 13 | - master 14 | 15 | jobs: 16 | test: 17 | uses: ppizarror/workflow-actions/.github/workflows/test_python.yml@master 18 | strategy: 19 | matrix: 20 | python: [ 3.8, 3.9, '3.10', '3.11', '3.12' ] 21 | with: 22 | install-extras: test 23 | os: ubuntu-latest 24 | python-version: ${{ matrix.python}} 25 | 26 | codeql: 27 | uses: ppizarror/workflow-actions/.github/workflows/codeql.yml@master 28 | with: 29 | language: python -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | ._* 3 | .idea/ 4 | .vscode/ 5 | 6 | # Build 7 | build/ 8 | dist/ 9 | **.egg-info 10 | 11 | # Test 12 | test/data/**_copy.txt -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Pablo Pizarro R. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | exclude *.bat 2 | exclude *.yml 3 | exclude .github/ISSUE_TEMPLATE/* 4 | exclude .gitignore 5 | exclude .replit 6 | exclude docs/* 7 | exclude docs/_source/* 8 | exclude docs/_static/* 9 | exclude test/*.py 10 | 11 | include requirements.txt -------------------------------------------------------------------------------- /PyMultiDictionary/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | PyMultiDictionary 3 | https://github.com/ppizarror/PyMultiDictionary 4 | 5 | PyMultiDictionary is a Dictionary Module for Python 2 to get meanings, translations, synonyms and antonyms of words in 20 different languages. 6 | """ 7 | 8 | try: 9 | from PyMultiDictionary._dictionary import * 10 | except ModuleNotFoundError: 11 | pass 12 | import PyMultiDictionary.version 13 | 14 | __author__ = 'Pablo Pizarro R.' 15 | __copyright__ = 'Copyright 2021 Pablo Pizarro R. @ppizarror' 16 | __description__ = 'PyMultiDictionary is a Dictionary Module for Python 2 to get meanings, translations, synonyms and antonyms of words in 20 different languages' 17 | __email__ = 'pablo@ppizarror.com' 18 | __keywords__ = 'dictionary multi-language synonym antonym definition' 19 | __license__ = 'MIT' 20 | __module_name__ = 'pymultidictionary' 21 | __url__ = 'https://github.com/ppizarror/PyMultiDictionary' 22 | __url_bug_tracker__ = 'https://github.com/ppizarror/PyMultiDictionary' 23 | __url_documentation__ = 'https://github.com/ppizarror/PyMultiDictionary' 24 | __url_source_code__ = 'https://github.com/ppizarror/PyMultiDictionary' 25 | __version__ = PyMultiDictionary.version.ver 26 | -------------------------------------------------------------------------------- /PyMultiDictionary/_dictionary.py: -------------------------------------------------------------------------------- 1 | """ 2 | PyMultiDictionary 3 | https://github.com/ppizarror/PyMultiDictionary 4 | 5 | DICTIONARY 6 | Dictionary object. 7 | """ 8 | 9 | __all__ = [ 10 | 'DICT_EDUCALINGO', 11 | 'DICT_SYNONYMCOM', 12 | 'DICT_THESAURUS', 13 | 'DICT_MW', 14 | 'MultiDictionary' 15 | ] 16 | 17 | import json 18 | import requests 19 | 20 | import PyMultiDictionary._goslate as goslate 21 | import PyMultiDictionary._utils as ut 22 | import urllib.error 23 | import urllib.parse 24 | 25 | from bs4 import BeautifulSoup # type: ignore 26 | from urllib.request import urlopen, Request 27 | from typing import Dict, Tuple, Optional, List, Union 28 | from warnings import warn 29 | 30 | # Dicts 31 | _EDUCALINGO_LANGS = ('bn', 'de', 'en', 'es', 'fr', 'hi', 'it', 'ja', 'jv', 'ko', 'mr', 32 | 'ms', 'pl', 'pt', 'ro', 'ru', 'ta', 'tr', 'uk', 'zh') 33 | 34 | DICT_EDUCALINGO: str = 'educalingo' 35 | DICT_MW: str = 'Merriam-Webster' 36 | DICT_SYNONYMCOM: str = 'synonym' 37 | DICT_THESAURUS: str = 'thesaurus' 38 | 39 | # URL header 40 | _HEADER: Dict[str, str] = { 41 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36' 42 | } 43 | 44 | # Cache 45 | _CACHED_SOUPS: Dict[str, 'BeautifulSoup'] = {} # Stores cached web 46 | 47 | # Types 48 | AntonymType = List[str] 49 | SynonymType = List[str] 50 | TranslationType = Union[List[Tuple[str, str]], List[str]] 51 | MeaningType = Union[Dict[str, List[str]], Tuple[List[str], str, str]] 52 | 53 | 54 | class MultiDictionary(object): 55 | """ 56 | Dictionary. Support synonyms, antonyms, meanings, and translations from some languages. 57 | """ 58 | _langs: Dict[str, Tuple[bool, bool, bool, bool]] # synonyms, meaning, translation, antonym 59 | _max_cached_websites: int # Maximum stored websites 60 | _test_cached_file: Dict[str, str] # If defined, loads that file instead 61 | _tokenize: bool # Enables word tokenizer 62 | _words: List[str] # List of words passed to the constructor 63 | _words_lang: str # Language of the words passed to the constructor 64 | 65 | def __init__(self, *words: Tuple[str, ...]) -> None: 66 | """ 67 | Constructor. 68 | 69 | :param words: List of words 70 | """ 71 | self._langs = { # iso 639 codes 72 | 'bn': (True, True, True, False), 73 | 'de': (True, True, True, False), 74 | 'en': (True, True, True, True), 75 | 'es': (True, True, True, False), 76 | 'fr': (True, True, True, False), 77 | 'hi': (True, True, True, False), 78 | 'it': (True, True, True, False), 79 | 'ja': (True, True, True, False), 80 | 'jv': (True, True, True, False), 81 | 'ko': (True, True, True, False), 82 | 'mr': (True, True, True, False), 83 | 'ms': (True, True, True, False), 84 | 'pl': (True, True, True, False), 85 | 'pt': (True, True, True, False), 86 | 'ro': (True, True, True, False), 87 | 'ru': (True, True, True, False), 88 | 'ta': (True, True, True, False), 89 | 'tr': (True, True, True, False), 90 | 'uk': (True, True, True, False), 91 | 'zh': (True, True, True, False) 92 | } 93 | self._max_cached_websites = 15 94 | self._test_cached_file = {} 95 | self._tokenize = True 96 | self._words = [] 97 | self._words_lang = '' 98 | for w in words: 99 | # noinspection PyTypeChecker 100 | w = self._process(w) 101 | if w != '' and w not in self._words: 102 | self._words.append(w) 103 | 104 | def set_words_lang(self, lang) -> None: 105 | """ 106 | Set words lang passed to the Dictionary. 107 | 108 | :param lang: Language of the words 109 | """ 110 | assert lang in self._langs.keys(), f'{lang} is not supported' 111 | self._words_lang = lang 112 | 113 | def _process(self, word: str) -> str: 114 | """ 115 | Process a given word. 116 | 117 | :param word: Word 118 | :return: Word without invalid chars 119 | """ 120 | assert isinstance(word, str), 'word must be a string' 121 | s: str = ''.join(i for i in word if not i.isdigit()) # remove numbers 122 | if self._tokenize: # tokenize 123 | s = ut.tokenize(s) 124 | s = s.lower() # lowercase 125 | s = s.replace('\n', '') # remove spaces 126 | return s.strip() 127 | 128 | def _bsoup(self, link: str, encoding: str = 'utf-8') -> Optional['BeautifulSoup']: 129 | """ 130 | Returns a parsed web. 131 | 132 | :param link: Link 133 | :param encoding: Web encoding 134 | :return: Parsed web. None if error 135 | """ 136 | bs_keys: List[str] = list(_CACHED_SOUPS.keys()) 137 | if link in bs_keys: 138 | return _CACHED_SOUPS[link] 139 | if link in self._test_cached_file.keys(): 140 | with open(self._test_cached_file[link], encoding='utf8') as f: 141 | data = ''.join(f.readlines()) 142 | else: 143 | try: 144 | data = self.__request(link, encoding) 145 | except (urllib.error.HTTPError, ValueError): 146 | return None 147 | bs: 'BeautifulSoup' = BeautifulSoup(data, 'html.parser') 148 | _CACHED_SOUPS[link] = bs 149 | if len(bs_keys) >= self._max_cached_websites: 150 | # noinspection PyTypeChecker 151 | del _CACHED_SOUPS[bs_keys[0]] 152 | return bs 153 | 154 | @staticmethod 155 | def __request(link: str, encoding: str) -> str: 156 | """ 157 | Attempt a request. 158 | 159 | :param link: Link 160 | :param encoding: Encoding 161 | :return: Content 162 | """ 163 | # noinspection PyBroadException 164 | try: 165 | response = requests.get(link, headers=_HEADER) 166 | response.raise_for_status() 167 | return response.text 168 | except Exception: 169 | req = Request(link, headers=_HEADER) 170 | try: 171 | import ssl 172 | return str(urlopen(req, context=ssl.SSLContext()).read().decode(encoding)) 173 | except ImportError: 174 | return str(urlopen(req).read().decode(encoding)) 175 | 176 | def _save_bsoup(self, link: str, filename: str, encoding: str = 'utf-8') -> None: 177 | """ 178 | Save bsoup to file. 179 | 180 | :param link: Load soup link 181 | :param filename: Output file 182 | :param encoding: Website encoding 183 | """ 184 | bs: 'BeautifulSoup' = self._bsoup(link, encoding) 185 | with open(filename, 'w', encoding='utf8') as out: 186 | out.write(str(bs.prettify())) 187 | 188 | def __check_defined_lang(self) -> None: 189 | """ 190 | Checks the lang has been defined. 191 | """ 192 | if self._words_lang == '': 193 | raise DictionaryLangNotDefined( 194 | 'dictionary lang have not been defined yet, call dictionary.set_words_lang(lang) first') 195 | 196 | def __synonym_com(self, word: str, _type: str) -> SynonymType: 197 | """ 198 | Retrieves synonyms from synonym.com. 199 | 200 | :param word: Word 201 | :param _type: Type (synonym, antonym) 202 | :return: Word list 203 | """ 204 | assert _type in ('Synonyms', 'Antonyms') 205 | word = word.replace(' ', '-') 206 | bs = self._bsoup(f'https://www.synonym.com/synonyms/{word}') 207 | if bs is None: 208 | return [] 209 | results = bs.find_all('div', {'class': 'section'}) 210 | en_words: SynonymType = [] 211 | for section in results: # Iterate each section 212 | title = section.find_all('h3', {'class': 'section-title'}) 213 | if len(title) == 0: 214 | continue 215 | title = title[0].text 216 | if '.' not in title or 'Quotes containing' in title or 'Words that' in title or 'Example sententes' in title: 217 | continue 218 | for subsection in section.find_all('div', {'class': 'section-list-wrapper'}): 219 | section_type = subsection.find_all('h4', {'class': 'section-list-header'}) 220 | if len(section_type) != 1: 221 | continue 222 | section_type = section_type[0].text.strip() 223 | if section_type != _type: 224 | continue 225 | sectionlist = subsection.find_all('ul', {'class': 'section-list'}) 226 | if len(sectionlist) != 1: 227 | continue 228 | sectionlist = sectionlist[0] 229 | if 'href' not in str(sectionlist): # Not links, but words 230 | for w in sectionlist.find_all('li'): 231 | wr: str = w.text.strip() 232 | if '(' not in wr and wr not in en_words: # Avoid onld english 233 | en_words.append(wr) 234 | else: 235 | for w in sectionlist.find_all('a'): 236 | wr: str = w.text.strip() 237 | if '(' not in wr and wr not in en_words: # Avoid onld english 238 | en_words.append(wr) 239 | return en_words 240 | 241 | def synonym(self, lang: str, word: str, dictionary: str = DICT_EDUCALINGO) -> SynonymType: 242 | """ 243 | Find the synonyms for a given word. 244 | 245 | :param lang: Lang code 246 | :param word: Word to retrieve 247 | :param dictionary: Dictionary to retrieve the synonyms 248 | :return: Synonyms list 249 | """ 250 | words: SynonymType = [] 251 | word = self._process(word) 252 | lang = lang.lower() 253 | 254 | assert dictionary in (DICT_EDUCALINGO, DICT_SYNONYMCOM, DICT_THESAURUS), 'Unsupported dictionary' 255 | if lang not in self._langs.keys() or not self._langs[lang][0]: 256 | raise InvalidLangCode(f'{lang} code is not supported for synonyms') 257 | elif word == '': 258 | return words 259 | 260 | elif dictionary == DICT_EDUCALINGO and lang in _EDUCALINGO_LANGS: 261 | bs = self.__search_educalingo(lang, word=word.replace(' ', '-')) 262 | if bs is None: 263 | return words 264 | results = [i for i in bs.find_all('div', {'class': 'contenido_sinonimos_antonimos0'})] 265 | if len(results) > 0: 266 | results = results[0] 267 | else: 268 | return words 269 | for j in results.find_all('a'): 270 | words.append(j.get('title').strip()) 271 | 272 | elif dictionary == DICT_SYNONYMCOM and lang == 'en': 273 | en_words = self.__synonym_com(word, 'Synonyms') 274 | for w in en_words: 275 | if w not in words: 276 | words.append(w) 277 | 278 | elif dictionary == DICT_THESAURUS and lang == 'en': 279 | word = word.replace(' ', '%20') 280 | bs = self._bsoup(f'https://www.thesaurus.com/browse/{word}') 281 | if bs is None: 282 | return words 283 | results = [i for i in bs.find_all('section', {'data-type': 'synonym-antonym-module'})] 284 | if len(results) == 1: 285 | results = results[0] 286 | for li in results.find_all('li'): 287 | sw = li.text.strip() 288 | if sw not in words: 289 | words.append(sw) 290 | 291 | else: 292 | raise InvalidDictionary(f'Dictionary {dictionary} cannot handle language {lang}') 293 | 294 | return words 295 | 296 | def get_synonyms(self, dictionary: str = DICT_EDUCALINGO) -> List[SynonymType]: 297 | """ 298 | Get the synonyms for all words of the dictionary. 299 | 300 | :param dictionary: Dictionary to retrieve the synonyms 301 | :return: Synonyms list 302 | """ 303 | self.__check_defined_lang() 304 | return [self.synonym(self._words_lang, w, dictionary) for w in self._words] 305 | 306 | def antonym(self, lang: str, word: str, dictionary: str = DICT_SYNONYMCOM) -> AntonymType: 307 | """ 308 | Finds an aynonym for a given word. 309 | 310 | :param lang: Lang code 311 | :param word: Word to retrieve 312 | :param dictionary: Dictionary to retrieve the antonyms 313 | :return: Synonyms list 314 | """ 315 | words: AntonymType = [] 316 | word = self._process(word) 317 | 318 | assert dictionary in DICT_SYNONYMCOM, 'Unsupported dictionary' 319 | if lang not in self._langs.keys() or not self._langs[lang][3]: 320 | raise InvalidLangCode(f'{lang} code is not supported for antonyms') 321 | elif word == '': 322 | return words 323 | 324 | elif dictionary == DICT_SYNONYMCOM and lang == 'en': 325 | en_words = self.__synonym_com(word, 'Antonyms') 326 | for w in en_words: 327 | if w not in words: 328 | words.append(w) 329 | 330 | return words 331 | 332 | def get_antonyms(self, dictionary: str = DICT_SYNONYMCOM) -> List[AntonymType]: 333 | """ 334 | Get the antonyms for all words of the dictionary. 335 | 336 | :param dictionary: Dictionary to retrieve the antonyms 337 | :return: Antonyms list 338 | """ 339 | self.__check_defined_lang() 340 | return [self.antonym(self._words_lang, w, dictionary) for w in self._words] 341 | 342 | def __search_educalingo(self, lang: str, word: str) -> Optional['BeautifulSoup']: 343 | """ 344 | Searches word for educalingo. 345 | 346 | :param lang: Language 347 | :param word: Word to search for 348 | :return: Search word content 349 | """ 350 | bs = self._bsoup(f'https://educalingo.com/en/dic-{lang}/{word}') 351 | if bs is None: # If failed, search word 352 | try: 353 | word = urllib.parse.quote_plus(word) 354 | r = json.loads(self.__request(f'https://search.educalingo.com/?dic={lang}&q={word}', 'utf-8')) 355 | if 'palabras' in r and len(r['palabras']) > 0: 356 | word = r['palabras'][0]['url'] 357 | return self._bsoup(f'https://educalingo.com/en/dic-{lang}/{word}') 358 | except (json.JSONDecodeError, KeyError): 359 | pass 360 | return bs 361 | 362 | def meaning(self, lang: str, word: str, dictionary: str = DICT_EDUCALINGO) -> MeaningType: 363 | """ 364 | Finds the meaning for a given word. 365 | 366 | :param lang: Lang code 367 | :param word: Word to retrieve 368 | :param dictionary: Dictionary to retrieve the meanings 369 | :return: Meaning 370 | """ 371 | types, words, wiki = [], '', '' 372 | word = self._process(word) 373 | 374 | assert dictionary in (DICT_EDUCALINGO, DICT_MW), 'Unsupported dictionary' 375 | if lang not in self._langs.keys() or not self._langs[lang][1]: 376 | raise InvalidLangCode(f'{lang} code is not supported for meanings') 377 | elif word == '': 378 | return types, words, wiki 379 | 380 | elif dictionary == DICT_EDUCALINGO and lang in _EDUCALINGO_LANGS: 381 | bs = self.__search_educalingo(lang, word=word.replace(' ', '-')) 382 | if bs is not None: 383 | results = [i for i in bs.find_all('div', {'id': 'cuadro_categoria_gramatical'})] 384 | if len(results) == 1: 385 | results = results[0] 386 | for j in results.find_all('div', {'class': 'categoria_gramatical'}): 387 | divj = j.find_all('div', {'class': 'circulo_categoria_gramatical'}) 388 | if len(divj) == 1: 389 | divcls = divj[0].get('class') 390 | if 'background_gris' not in divcls: 391 | typej = j.find_all('div', {'class': 'texto_pie_categoria_gramatical'}) 392 | if len(typej) == 1: 393 | t = typej[0].text.strip().capitalize() 394 | if t != '': 395 | types.append(t) 396 | 397 | # Definition 398 | results = [i for i in bs.find_all('div', {'id': 'significado_de'})] 399 | if len(results) > 0: 400 | words = results[0].text.strip().replace('\n', '') 401 | 402 | # Wikipedia 403 | results = [i for i in bs.find_all('span', {'id': 'wiki_introduccion'})] 404 | if len(results) > 0: 405 | wiki = results[0].text.strip().replace('\n', '') 406 | 407 | return types, words, wiki 408 | 409 | elif dictionary == DICT_MW and lang == 'en': 410 | if not word.strip(): 411 | return {} 412 | bs = self._bsoup(f'https://www.merriam-webster.com/dictionary/{word}') 413 | if bs is None: 414 | return {} 415 | 416 | definitions: MeaningType = {} 417 | for pos_tag in bs.find_all('h2', class_='parts-of-speech'): 418 | part_of_speech = pos_tag.get_text(strip=True) 419 | 420 | if part_of_speech in definitions: 421 | continue 422 | 423 | definitions[part_of_speech] = [] 424 | definition_section = pos_tag.find_next('div', class_='vg') 425 | if not definition_section: 426 | continue 427 | 428 | for sense in definition_section.find_all('div', class_='sb'): 429 | definition_texts = sense.find_all('span', class_='dtText') 430 | for def_text in definition_texts: 431 | definition = def_text.get_text().lstrip(": ") 432 | if definition: 433 | definitions[part_of_speech].append(definition) 434 | 435 | return definitions 436 | 437 | raise InvalidDictionary(f'Dictionary {dictionary} cannot handle language {lang}') 438 | 439 | def get_meanings(self, dictionary: str = DICT_EDUCALINGO) -> List[MeaningType]: 440 | """ 441 | Get the ameanings for all words of the dictionary. 442 | 443 | :param dictionary: Dictionary to retrieve the meanings 444 | :return: Meanings list 445 | """ 446 | self.__check_defined_lang() 447 | return [self.meaning(self._words_lang, w, dictionary) for w in self._words] 448 | 449 | def translate(self, lang: str, word: str, to: str = '', dictionary: str = DICT_EDUCALINGO) -> TranslationType: 450 | """ 451 | Translate a word. 452 | 453 | :param lang: Lang tag (ISO 639) 454 | :param word: Word to translate 455 | :param to: Target language (Google API) 456 | :param dictionary: Dictionary to retrieve the translations if ``to`` is empty 457 | :return: List of (Lang tag, translated word) 458 | """ 459 | assert isinstance(lang, str), 'lang code must be a string' 460 | assert isinstance(to, str), 'to lang code must be a string' 461 | words: TranslationType = [] 462 | word = self._process(word) 463 | 464 | assert dictionary in DICT_EDUCALINGO, 'Unsupported dictionary' 465 | if to != '': 466 | gs = goslate.Goslate() 467 | try: 468 | return [(to, gs.translate(word, to, lang))] 469 | except (urllib.error.HTTPError, IndexError) as e: 470 | warn(f'{word} cannot be translated to {to}-language as Google API is not available. Error: {e}') 471 | 472 | if lang not in self._langs.keys() or not self._langs[lang][2]: 473 | raise InvalidLangCode(f'{lang} code is not supported for translation') 474 | 475 | elif lang in _EDUCALINGO_LANGS: 476 | bs = self.__search_educalingo(lang, word=word.replace(' ', '-')) 477 | if bs is None: 478 | return words 479 | results = [i for i in bs.find_all('div', {'class': 'traduccion0'})] 480 | if len(results) == 0: 481 | return words 482 | for j in results: 483 | lang_tag = j.get('id') 484 | # noinspection PyTypeChecker 485 | lang_name = j.find_all('h4', {'class', 'traductor'}) 486 | if len(lang_name) != 1: 487 | continue 488 | lang_name = lang_name[0].find_all('strong', {}) 489 | if len(lang_name) != 1: 490 | continue 491 | 492 | # Find non-links 493 | lang_nonlink = j.find_all('span', {'class': 'negro'}) 494 | if len(lang_nonlink) == 1: 495 | words.append((lang_tag, lang_nonlink[0].text.strip())) 496 | continue 497 | 498 | # Find links 499 | lang_link = j.find_all('strong', {}) 500 | if len(lang_link) != 2: 501 | continue 502 | lang_link = lang_link[1].find_all('a', {}) 503 | if len(lang_link) == 1: 504 | words.append((lang_tag, lang_link[0].text.strip())) 505 | 506 | # Sort translations 507 | words = sorted(words, key=lambda x: x[0]) 508 | 509 | return words 510 | 511 | def get_translations(self, to: str = '', dictionary: str = DICT_EDUCALINGO) -> List[TranslationType]: 512 | """ 513 | Get the translations for all words of the dictionary. 514 | 515 | :param to: Target language (Google API) 516 | :param dictionary: Dictionary to retrieve the translations if ``to`` is empty 517 | :return: Translations list 518 | """ 519 | self.__check_defined_lang() 520 | return [self.translate(self._words_lang, w, to, dictionary) for w in self._words] 521 | 522 | @staticmethod 523 | def get_language_name(lang: str, lang_out: str = '') -> str: 524 | """ 525 | Returns the name of a language. 526 | 527 | :param lang: Language tag (ISO 639) 528 | :param lang_out: Target language (ISO 639). If not supported, will return the English name 529 | :return: Language name from tag 530 | """ 531 | return ut.get_language_name(lang, lang_out) 532 | 533 | 534 | class DictionaryLangNotDefined(Exception): 535 | """ 536 | Dictionary lang not defined. 537 | """ 538 | 539 | 540 | class InvalidLangCode(Exception): 541 | """ 542 | Invalid lang. 543 | """ 544 | 545 | 546 | class InvalidDictionary(Exception): 547 | """ 548 | Invalid dictionary. 549 | """ 550 | -------------------------------------------------------------------------------- /PyMultiDictionary/_goslate.py: -------------------------------------------------------------------------------- 1 | """ 2 | PyMultiDictionary 3 | https://github.com/ppizarror/PyMultiDictionary 4 | 5 | GOSLATE 6 | Goslate module. 7 | https://github.com/yeahwhat-mc/goslate 8 | """ 9 | 10 | __all__ = ['Goslate'] 11 | 12 | import sys 13 | import json 14 | import itertools 15 | import functools 16 | import time 17 | import socket 18 | import random 19 | import re 20 | 21 | from urllib.request import build_opener, Request, HTTPHandler 22 | from urllib.parse import quote_plus, urlencode, unquote_plus, urljoin 23 | 24 | izip = zip 25 | 26 | try: 27 | # noinspection PyUnresolvedReferences 28 | import concurrent.futures 29 | 30 | _g_executor = concurrent.futures.ThreadPoolExecutor(max_workers=120) 31 | except ImportError: 32 | _g_executor = None 33 | 34 | __author__ = 'ZHUO Qiang' 35 | __email__ = 'zhuo.qiang@gmail.com' 36 | __copyright__ = "2013, https://zhuoqiang.me" 37 | __license__ = "MIT" 38 | __date__ = '2013-05-11' 39 | __version_info__ = (1, 4, 0) 40 | __version__ = '.'.join(str(i) for i in __version_info__) 41 | __home__ = 'https://bitbucket.org/zhuoqiang/goslate' 42 | __download__ = 'https://pypi.python.org/pypi/goslate' 43 | 44 | 45 | def _is_sequence(arg): 46 | return (not isinstance(arg, str)) and ( 47 | not isinstance(arg, bytes)) and ( 48 | hasattr(arg, "__getitem__") or hasattr(arg, "__iter__")) 49 | 50 | 51 | def _is_bytes(arg): 52 | return isinstance(arg, bytes) 53 | 54 | 55 | def _unwrapper_single_element(elements): 56 | if len(elements) == 1: 57 | return elements[0] 58 | return elements 59 | 60 | 61 | class Error(Exception): 62 | """ 63 | Error type. 64 | """ 65 | pass 66 | 67 | 68 | _empty_comma = re.compile(r',(?=,)') 69 | 70 | WRITING_NATIVE = ('trans',) 71 | '''native target language writing system''' 72 | 73 | WRITING_ROMAN = ('translit',) 74 | '''romanlized writing system. only valid for some langauges, otherwise it outputs empty string''' 75 | 76 | WRITING_NATIVE_AND_ROMAN = WRITING_NATIVE + WRITING_ROMAN 77 | '''both native and roman writing. The output will be a tuple''' 78 | 79 | 80 | # noinspection HttpUrlsUsage,PyPep8Naming,PyShadowingNames,PyMissingOrEmptyDocstring,PyUnresolvedReferences 81 | class Goslate(object): 82 | """All goslate API lives in this class 83 | 84 | You have to first create an instance of Goslate to use this API 85 | 86 | :param writing: The translation writing system. Currently, 3 values are valid 87 | 88 | - :const:`WRITING_NATIVE` for native writing system 89 | - :const:`WRITING_ROMAN` for roman writing system 90 | - :const:`WRITING_NATIVE_AND_ROMAN` for both native and roman writing system. output will be a tuple in this case 91 | 92 | :param opener: The url opener to be used for HTTP/HTTPS query. 93 | If not provide, a default opener will be used. 94 | For proxy support you should provide an ``opener`` with ``ProxyHandler`` 95 | :type opener: `urllib2.OpenerDirector `_ 96 | 97 | :param retry_times: how many times to retry when connection reset error occured. Default to 4 98 | :type retry_times: int 99 | 100 | :type max_workers: int 101 | 102 | :param timeout: HTTP request timeout in seconds 103 | :type timeout: int/float 104 | 105 | :param debug: Turn on/off the debug output 106 | :type debug: bool 107 | 108 | :param service_urls: Google Translate url list. URLs will be used randomly for better concurrent performance. For example ``['http://translate.google.com', 'http://translate.google.de']`` 109 | :type service_urls: single string or a sequence of strings 110 | 111 | :param executor: the multi thread executor for handling batch input, default to a global ``futures.ThreadPoolExecutor`` instance with 120 max thead workers if ``futures`` is avalible. Set to None to disable multi thread support 112 | :type executor: ``futures.ThreadPoolExecutor`` 113 | 114 | .. note:: multi thread worker relys on `futures `_, if it is not avalible, ``goslate`` will work under single thread mode 115 | 116 | :Example: 117 | 118 | >>> import goslate 119 | >>> 120 | >>> # Create a Goslate instance first 121 | >>> gs = goslate.Goslate() 122 | >>> 123 | >>> # You could get all supported language list through get_languages 124 | >>> languages = gs.get_languages() 125 | >>> print(languages['en']) 126 | English 127 | >>> 128 | >>> # Tranlate English into German 129 | >>> print(gs.translate('hello', 'de')) 130 | hallo 131 | >>> # Detect the language of the text 132 | >>> print(gs.detect('some English words')) 133 | en 134 | >>> # Get goslate object dedicated for romanlized translation (romanlization) 135 | >>> gs_roman = goslate.Goslate(WRITING_ROMAN) 136 | >>> print(gs_roman.translate('hello', 'zh')) 137 | Nín hǎo 138 | """ 139 | _MAX_LENGTH_PER_QUERY = 1800 140 | 141 | def __init__(self, writing=WRITING_NATIVE, opener=None, retry_times=4, executor=_g_executor, 142 | timeout=4, service_urls=('http://translate.google.com',), debug=False): 143 | self._DEBUG = debug 144 | self._MIN_TASKS_FOR_CONCURRENT = 2 145 | self._opener = opener 146 | self._languages = None 147 | self._TIMEOUT = timeout 148 | 149 | self._RETRY_TIMES = retry_times 150 | self._executor = executor 151 | self._writing = writing 152 | if _is_sequence(service_urls): 153 | self._service_urls = service_urls 154 | else: 155 | self._service_urls = (service_urls,) 156 | 157 | def _open_url(self, url): 158 | if len(url) > self._MAX_LENGTH_PER_QUERY + 100: 159 | raise Error('input too large') 160 | 161 | # Google forbits urllib2 User-Agent: Python-urllib/2.7 162 | request = Request(url, headers={'User-Agent': 'Mozilla/4.0'}) 163 | 164 | if not self._opener: 165 | debuglevel = self._DEBUG and 1 or 0 166 | from urllib.request import HTTPSHandler 167 | self._opener = build_opener( 168 | HTTPHandler(debuglevel=debuglevel), 169 | HTTPSHandler(debuglevel=debuglevel)) 170 | 171 | exception = None 172 | # retry when get (, error(54, 'Connection reset by peer') 173 | for _ in range(self._RETRY_TIMES): 174 | try: 175 | response = self._opener.open(request, timeout=self._TIMEOUT) 176 | response_content = response.read().decode('utf-8') 177 | if self._DEBUG: 178 | print('GET Response body:{}'.format(response_content)) 179 | return response_content 180 | except socket.error as e: 181 | if self._DEBUG: 182 | import threading 183 | # noinspection PyDeprecation 184 | print(threading.currentThread(), e) 185 | if 'Connection reset by peer' not in str(e): 186 | raise e 187 | exception = e 188 | time.sleep(0.0001) 189 | raise exception 190 | 191 | def _execute(self, tasks): 192 | first_tasks = [next(tasks, None) for _ in range(self._MIN_TASKS_FOR_CONCURRENT)] 193 | tasks = (task for task in itertools.chain(first_tasks, tasks) if task) 194 | 195 | if not first_tasks[-1] or not self._executor: 196 | for each in tasks: 197 | yield each() 198 | else: 199 | exception = None 200 | for each in [self._executor.submit(t) for t in tasks]: 201 | if exception: 202 | each.cancel() 203 | else: 204 | exception = each.exception() 205 | if not exception: 206 | yield each.result() 207 | 208 | if exception: 209 | raise exception 210 | 211 | def _basic_translate(self, text, target_language, source_language): 212 | # assert _is_bytes(text) 213 | 214 | if not target_language: 215 | raise Error('invalid target language') 216 | 217 | if not text.strip(): 218 | return tuple(u'' for _ in range(len(self._writing))), str(target_language) 219 | 220 | # Browser request for 'hello world' is: 221 | # http://translate.google.com/translate_a/t?client=t&hl=en&sl=en&tl=zh-CN&ie=UTF-8&oe=UTF-8&multires=1&prev=conf&psl=en&ptl=en&otf=1&it=sel.2016&ssel=0&tsel=0&prev=enter&oc=3&ssel=0&tsel=0&sc=1&text=hello%20world 222 | 223 | # 2015-04: Google had changed service, it is now: 224 | # https://translate.google.com/translate_a/single?client=z&sl=en&tl=zh-CN&ie=UTF-8&oe=UTF-8&dt=t&dt=rm&q=hello%20world 225 | # dt=t: translate 226 | # dt=rm: romanlized writing, like Chinese Pinyin 227 | GOOGLE_TRASLATE_URL = urljoin(random.choice(self._service_urls), '/translate_a/single') 228 | GOOGLE_TRASLATE_PARAMETERS = { 229 | 'client': 'a', 230 | 'sl': source_language, 231 | 'tl': target_language, 232 | 'ie': 'UTF-8', 233 | 'oe': 'UTF-8', 234 | 'dt': 't', 235 | 'q': text, 236 | } 237 | 238 | url = '?'.join((GOOGLE_TRASLATE_URL, urlencode(GOOGLE_TRASLATE_PARAMETERS))) 239 | if 'translit' in self._writing: 240 | url += '&dt=rm' 241 | 242 | response_content = self._open_url(url) 243 | raw_data = json.loads(_empty_comma.subn('', response_content)[0].replace(u'\xA0', u' ').replace('[,', '[1,')) 244 | data = {'src': raw_data[-1][0][0]} 245 | 246 | if raw_data[0][-1][0] == 1: # roman writing 247 | data['translit'] = raw_data[0][-1][1] 248 | data['trans'] = u''.join(i[0] for i in raw_data[0][:-1]) 249 | else: 250 | data['translit'] = u'' 251 | data['trans'] = u''.join(i[0] for i in raw_data[0]) 252 | 253 | translation = tuple(data[part] for part in self._writing) 254 | 255 | detected_source_language = data['src'] 256 | return translation, detected_source_language 257 | 258 | def get_languages(self): 259 | """ 260 | Discover supported languages 261 | 262 | It returns iso639-1 language codes for 263 | `supported languages `_ 264 | for translation. Some language codes also include a country code, like zh-CN or zh-TW. 265 | 266 | .. note:: It only queries Google once for the first time and use cached result afterward 267 | 268 | :returns: a dict of all supported language code and language name mapping ``{'language-code', 'Language name'}`` 269 | 270 | :Example: 271 | 272 | >>> languages = Goslate().get_languages() 273 | >>> assert 'zh' in languages 274 | >>> print(languages['zh']) 275 | Chinese 276 | 277 | """ 278 | if self._languages: 279 | return self._languages 280 | 281 | GOOGLE_TRASLATOR_URL = 'http://translate.google.com/translate_a/l' 282 | GOOGLE_TRASLATOR_PARAMETERS = { 283 | 'client': 't', 284 | } 285 | 286 | url = '?'.join((GOOGLE_TRASLATOR_URL, urlencode(GOOGLE_TRASLATOR_PARAMETERS))) 287 | response_content = self._open_url(url) 288 | data = json.loads(response_content) 289 | 290 | languages = data['sl'] 291 | languages.update(data['tl']) 292 | if 'auto' in languages: 293 | del languages['auto'] 294 | if 'zh' not in languages: 295 | languages['zh'] = 'Chinese' 296 | self._languages = languages 297 | return self._languages 298 | 299 | _SEPERATORS = [quote_plus(i.encode()) for i in 300 | u'.!?,;。,?!::"“”’‘#$%&()()*×+/<=>@#¥[\\]…[]^`{|}{}~~\n\r\t '] 301 | 302 | def _translate_single_text(self, text, target_language, source_lauguage): 303 | assert _is_bytes(text) 304 | 305 | def split_text(text): 306 | start = 0 307 | text = quote_plus(text) 308 | length = len(text) 309 | while (length - start) > self._MAX_LENGTH_PER_QUERY: 310 | for seperator in self._SEPERATORS: 311 | index = text.rfind(seperator, start, start + self._MAX_LENGTH_PER_QUERY) 312 | if index != -1: 313 | break 314 | else: 315 | raise Error('input too large') 316 | end = index + len(seperator) 317 | yield unquote_plus(text[start:end]) 318 | start = end 319 | 320 | yield unquote_plus(text[start:]) 321 | 322 | def make_task(text): 323 | return lambda: self._basic_translate(text, target_language, source_lauguage)[0] 324 | 325 | results = list(self._execute(make_task(i) for i in split_text(text))) 326 | return tuple(''.join(i[n] for i in results) for n in range(len(self._writing))) 327 | 328 | def translate(self, text, target_language, source_language='auto'): 329 | """ 330 | Translate text from source language to target language. 331 | 332 | .. note:: 333 | 334 | - Input all source strings at once. Goslate will batch and fetch concurrently for maximize speed. 335 | - `futures `_ is required for best performance. 336 | - It returns generator on batch input to better fit pipeline architecture 337 | 338 | :param text: The source text(s) to be translated. Batch translation is supported via sequence input 339 | :type text: UTF-8 str; unicode; string sequence (list, tuple, iterator, generator) 340 | 341 | :param target_language: The language to translate the source text into. 342 | The value should be one of the language codes listed in :func:`get_languages` 343 | :type target_language: str; unicode 344 | 345 | :param source_language: The language of the source text. 346 | The value should be one of the language codes listed in :func:`get_languages`. 347 | If a language is not specified, 348 | the system will attempt to identify the source language automatically. 349 | :type source_language: str; unicode 350 | 351 | :returns: the translated text(s) 352 | 353 | - unicode: on single string input 354 | - generator of unicode: on batch input of string sequence 355 | - tuple: if WRITING_NATIVE_AND_ROMAN is specified, it will return tuple/generator for tuple (u"native", u"roman format") 356 | 357 | :raises: 358 | - :class:`Error` ('invalid target language') if target language is not set 359 | - :class:`Error` ('input too large') if input a single large word without any punctuation or space in between 360 | 361 | 362 | :Example: 363 | 364 | >>> gs = Goslate() 365 | >>> print(gs.translate('Hello World', 'de')) 366 | Hallo Welt 367 | >>> 368 | >>> for i in gs.translate(['good', u'morning'], 'de'): 369 | ... print(i) 370 | ... 371 | gut aus 372 | Morgen 373 | 374 | To output romanlized translation 375 | 376 | :Example: 377 | 378 | >>> gs_roman = Goslate(WRITING_ROMAN) 379 | >>> print(gs_roman.translate('Hello', 'zh')) 380 | Nín hǎo 381 | 382 | """ 383 | 384 | if not target_language: 385 | raise Error('invalid target language') 386 | 387 | if not source_language: 388 | source_language = 'auto' 389 | 390 | if target_language.lower() == 'zh': 391 | target_language = 'zh-CN' 392 | 393 | if source_language.lower() == 'zh': 394 | source_language = 'zh-CN' 395 | 396 | if not _is_sequence(text): 397 | if isinstance(text, str): 398 | text = text.encode() 399 | return _unwrapper_single_element(self._translate_single_text(text, target_language, source_language)) 400 | 401 | JOINT = u'\u26ff' 402 | UTF8_JOINT = (u'\n%s\n' % JOINT).encode() 403 | 404 | def join_texts(texts): 405 | def convert_to_utf8(texts): 406 | for i in texts: 407 | if isinstance(i, str): 408 | i = i.encode() 409 | yield i.strip() 410 | 411 | texts = convert_to_utf8(texts) 412 | text = next(texts) 413 | for i in texts: 414 | new_text = UTF8_JOINT.join((text, i)) 415 | if len(quote_plus(new_text)) < self._MAX_LENGTH_PER_QUERY: 416 | text = new_text 417 | else: 418 | yield text 419 | text = i 420 | yield text 421 | 422 | def make_task(text): 423 | def task(): 424 | r = self._translate_single_text(text, target_language, source_language) 425 | r = tuple([i.strip('\n') for i in n.split(JOINT)] for n in r) 426 | return izip(*r) 427 | # return r[0] 428 | 429 | return task 430 | 431 | return (_unwrapper_single_element(i) for i in 432 | itertools.chain.from_iterable(self._execute(make_task(i) for i in join_texts(text)))) 433 | 434 | def _detect_language(self, text): 435 | if _is_bytes(text): 436 | text = text.decode('utf-8') 437 | return self._basic_translate(text[:50].encode('utf-8'), 'en', 'auto')[1] 438 | 439 | def detect(self, text): 440 | """ 441 | Detect language of the input text. 442 | 443 | .. note:: 444 | 445 | - Input all source strings at once. Goslate will detect concurrently for maximize speed. 446 | - `futures `_ is required for best performance. 447 | - It returns generator on batch input to better fit pipeline architecture. 448 | 449 | :param text: The source text(s) whose language you want to identify. 450 | Batch detection is supported via sequence input 451 | :type text: UTF-8 str; unicode; sequence of string 452 | :returns: the language code(s) 453 | 454 | - unicode: on single string input 455 | - generator of unicode: on batch input of string sequence 456 | 457 | :raises: :class:`Error` if parameter type or value is not valid 458 | 459 | Example:: 460 | 461 | >>> gs = Goslate() 462 | >>> print(gs.detect('hello world')) 463 | en 464 | >>> for i in gs.detect([u'hello', 'Hallo']): 465 | ... print(i) 466 | ... 467 | en 468 | de 469 | 470 | """ 471 | if _is_sequence(text): 472 | return self._execute(functools.partial(self._detect_language, i) for i in text) 473 | return self._detect_language(text) 474 | 475 | 476 | def _main(argv): 477 | import optparse 478 | 479 | usage = "usage: %prog [options] \n will be used as input source if no file specified." 480 | 481 | parser = optparse.OptionParser(usage=usage, version="%%prog %s @ Copyright %s" % (__version__, __copyright__)) 482 | parser.add_option('-t', '--target-language', metavar='zh-CN', 483 | help='specify target language to translate the source text into') 484 | parser.add_option('-s', '--source-language', default='auto', metavar='en', 485 | help='specify source language, if not provide it will identify the source language automatically') 486 | parser.add_option('-i', '--input-encoding', default=sys.getfilesystemencoding(), metavar='utf-8', 487 | help='specify input encoding, default to current console system encoding') 488 | parser.add_option('-o', '--output-encoding', default=sys.getfilesystemencoding(), metavar='utf-8', 489 | help='specify output encoding, default to current console system encoding') 490 | parser.add_option('-r', '--roman', action="store_true", 491 | help='change translation writing to roman (e.g.: output pinyin instead of Chinese charactors for Chinese. It only valid for some of the target languages)') 492 | 493 | options, args = parser.parse_args(argv[1:]) 494 | 495 | if not options.target_language: 496 | print('Error: missing target language!') 497 | parser.print_help() 498 | return 499 | 500 | writing = WRITING_NATIVE 501 | if options.roman: 502 | writing = WRITING_ROMAN 503 | 504 | gs = Goslate(writing=writing) 505 | import fileinput 506 | # inputs = fileinput.input(args, mode='rU', openhook=fileinput.hook_encoded(options.input_encoding)) 507 | inputs = fileinput.input(args, mode='rb') 508 | inputs = (i.decode(options.input_encoding) for i in inputs) 509 | outputs = gs.translate(inputs, options.target_language, options.source_language) 510 | for i in outputs: 511 | sys.stdout.write((i + u'\n').encode(options.output_encoding)) 512 | sys.stdout.flush() 513 | -------------------------------------------------------------------------------- /PyMultiDictionary/_tokenizer.py: -------------------------------------------------------------------------------- 1 | """ 2 | PyMultiDictionary 3 | https://github.com/ppizarror/PyMultiDictionary 4 | 5 | TOKENIZER 6 | Implements a tokenizer from nltk library. 7 | """ 8 | 9 | __all__ = ['RegexpTokenizer'] 10 | 11 | from abc import ABC, abstractmethod 12 | import re 13 | import types 14 | 15 | 16 | # noinspection PyMissingOrEmptyDocstring 17 | def overridden(method): 18 | if isinstance(method, types.MethodType) and method.__self__.__class__ is not None: 19 | name = method.__name__ 20 | funcs = [ 21 | cls.__dict__[name] 22 | for cls in _mro(method.__self__.__class__) 23 | if name in cls.__dict__ 24 | ] 25 | return len(funcs) > 1 26 | else: 27 | raise TypeError('Expected an instance method.') 28 | 29 | 30 | def _mro(cls): 31 | if isinstance(cls, type): 32 | return cls.__mro__ 33 | else: 34 | mro = [cls] 35 | for base in cls.__bases__: 36 | mro.extend(_mro(base)) 37 | return mro 38 | 39 | 40 | # noinspection PyShadowingBuiltins,PyMissingOrEmptyDocstring 41 | def regexp_span_tokenize(s, regexp): 42 | left = 0 43 | for m in re.finditer(regexp, s): 44 | right, next = m.span() 45 | if right != left: 46 | yield left, right 47 | left = next 48 | yield left, len(s) 49 | 50 | 51 | # noinspection PyTypeChecker 52 | class TokenizerI(ABC): 53 | """ 54 | A processing interface for tokenizing a string. 55 | Subclasses must define ``tokenize()`` or ``tokenize_sents()`` (or both). 56 | """ 57 | 58 | @abstractmethod 59 | def tokenize(self, s): 60 | """ 61 | Return a tokenized copy of *s*. 62 | 63 | :rtype: list of str 64 | """ 65 | if overridden(self.tokenize_sents): 66 | return self.tokenize_sents([s])[0] 67 | return [] 68 | 69 | def span_tokenize(self, s): 70 | """ 71 | Identify the tokens using integer offsets ``(start_i, end_i)``, 72 | where ``s[start_i:end_i]`` is the corresponding token. 73 | 74 | :rtype: iter(tuple(int, int)) 75 | """ 76 | raise NotImplementedError() 77 | 78 | def tokenize_sents(self, strings): 79 | """ 80 | Apply ``self.tokenize()`` to each element of ``strings``. I.e.: 81 | 82 | return [self.tokenize(s) for s in strings] 83 | 84 | :rtype: list(list(str)) 85 | """ 86 | return [self.tokenize(s) for s in strings] 87 | 88 | def span_tokenize_sents(self, strings): 89 | """ 90 | Apply ``self.span_tokenize()`` to each element of ``strings``. I.e.: 91 | 92 | return [self.span_tokenize(s) for s in strings] 93 | 94 | :rtype: iter(list(tuple(int, int))) 95 | """ 96 | for s in strings: 97 | yield list(self.span_tokenize(s)) 98 | 99 | 100 | # noinspection PyMissingOrEmptyDocstring 101 | class RegexpTokenizer(TokenizerI): 102 | r""" 103 | A tokenizer that splits a string using a regular expression, which 104 | matches either the tokens or the separators between tokens. 105 | 106 | >>> tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\S+') 107 | 108 | :type pattern: str 109 | :param pattern: The pattern used to build this tokenizer. 110 | (This pattern must not contain capturing parentheses; 111 | Use non-capturing parentheses, e.g. (?:...), instead) 112 | :type gaps: bool 113 | :param gaps: True if this tokenizer's pattern should be used 114 | to find separators between tokens; False if this 115 | tokenizer's pattern should be used to find the tokens 116 | themselves. 117 | :type discard_empty: bool 118 | :param discard_empty: True if any empty tokens `''` 119 | generated by the tokenizer should be discarded. Empty 120 | tokens can only be generated if `_gaps == True`. 121 | :type flags: int 122 | :param flags: The regexp flags used to compile this 123 | tokenizer's pattern. By default, the following flags are 124 | used: `re.UNICODE | re.MULTILINE | re.DOTALL`. 125 | """ 126 | 127 | def __init__( 128 | self, 129 | pattern, 130 | gaps=False, 131 | discard_empty=True, 132 | flags=re.UNICODE | re.MULTILINE | re.DOTALL, 133 | ): 134 | # If they gave us a regexp object, extract the pattern. 135 | pattern = getattr(pattern, "pattern", pattern) 136 | 137 | self._pattern = pattern 138 | self._gaps = gaps 139 | self._discard_empty = discard_empty 140 | self._flags = flags 141 | self._regexp = None 142 | 143 | def _check_regexp(self): 144 | if self._regexp is None: 145 | self._regexp = re.compile(self._pattern, self._flags) 146 | 147 | def tokenize(self, text): 148 | self._check_regexp() 149 | # If our regexp matches gaps, use re.split: 150 | if self._gaps: 151 | if self._discard_empty: 152 | return [tok for tok in self._regexp.split(text) if tok] 153 | else: 154 | return self._regexp.split(text) 155 | 156 | # If our regexp matches tokens, use re.findall: 157 | else: 158 | return self._regexp.findall(text) 159 | 160 | def span_tokenize(self, text): 161 | self._check_regexp() 162 | 163 | if self._gaps: 164 | for left, right in regexp_span_tokenize(text, self._regexp): 165 | if not (self._discard_empty and left == right): 166 | yield left, right 167 | else: 168 | for m in re.finditer(self._regexp, text): 169 | yield m.span() 170 | 171 | def __repr__(self): 172 | return "{}(pattern={!r}, gaps={!r}, discard_empty={!r}, flags={!r})".format( 173 | self.__class__.__name__, 174 | self._pattern, 175 | self._gaps, 176 | self._discard_empty, 177 | self._flags, 178 | ) 179 | -------------------------------------------------------------------------------- /PyMultiDictionary/_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | PyMultiDictionary 3 | https://github.com/ppizarror/PyMultiDictionary 4 | 5 | DICTIONARY 6 | Dictionary object. 7 | """ 8 | 9 | __all__ = [ 10 | 'get_language_name', 11 | 'LANG_NAMES', 12 | 'tokenize' 13 | ] 14 | 15 | # noinspection PyPackageRequirements 16 | from iso639 import Lang 17 | # noinspection PyPackageRequirements 18 | from iso639.exceptions import InvalidLanguageValue 19 | 20 | from PyMultiDictionary._tokenizer import * 21 | 22 | # Tokenizer 23 | _TOKENIZER = RegexpTokenizer(r'\w+(?:-\w+)*') 24 | 25 | # Enhanced lang names 26 | LANG_NAMES = { 27 | 'bn': [('af', 'আফ্রিকান'), ('ar', 'আরবী'), ('bn', 'বাংলা'), ('de', 'জার্মান'), ('el', 'গ্রীক্\u200c'), 28 | ('en', 'ইংরেজী'), ('es', 'স্পেনীয়'), ('fr', 'ফরাসি'), ('hi', 'হিন্দি'), ('it', 'ইতালীয়'), ('ja', 'জাপানি'), 29 | ('jv', 'জাভানি'), ('ko', 'কোরিয়ান'), ('mr', 'মারাঠি'), ('ms', 'মালে'), ('no', 'নরওয়েজীয়'), 30 | ('pl', 'পোলীশ'), ('pt', 'পর্তুগীজ'), ('ro', 'রোমানীয়'), ('ru', 'রুশ'), ('sv', 'সুইডিশ'), ('ta', 'তামিল'), 31 | ('tr', 'তুর্কী'), ('uk', 'ইউক্রেনীয়'), ('vi', 'ভিয়েতনামিয়'), ('zh', 'চীনা')], 32 | 'de': [('af', 'Afrikaans'), ('ar', 'Arabisch'), ('bn', 'Bengalisch'), ('de', 'Deutsch'), ('el', 'Griechisch'), 33 | ('en', 'Englisch'), ('es', 'Spanisch'), ('fr', 'Französisch'), ('hi', 'Hindi'), ('it', 'Italienisch'), 34 | ('ja', 'Japanisch'), ('jv', 'Javanisch'), ('ko', 'Koreanisch'), ('mr', 'Marathi'), ('ms', 'Malaysisch'), 35 | ('no', 'Norwegisch'), ('pl', 'Polnisch'), ('pt', 'Portugiesisch'), ('ro', 'Rumänisch'), ('ru', 'Russisch'), 36 | ('sv', 'Schwedisch'), ('ta', 'Tamil'), ('tr', 'Türkisch'), ('uk', 'Ukrainisch'), ('vi', 'Vietnamesisch'), 37 | ('zh', 'Chinesisch')], 38 | 'en': [('af', 'Afrikaans'), ('ar', 'Arabic'), ('bn', 'Bengali'), ('de', 'German'), ('el', 'Greek'), 39 | ('en', 'English'), ('es', 'Spanish'), ('fr', 'French'), ('hi', 'Hindi'), ('it', 'Italian'), 40 | ('ja', 'Japanese'), ('jv', 'Javanese'), ('ko', 'Korean'), ('mr', 'Marathi'), ('ms', 'Malay'), 41 | ('no', 'Norwegian'), ('pl', 'Polish'), ('pt', 'Portuguese'), ('ro', 'Romanian'), ('ru', 'Russian'), 42 | ('sv', 'Swedish'), ('ta', 'Tamil'), ('tr', 'Turkish'), ('uk', 'Ukrainian'), ('vi', 'Vietnamese'), 43 | ('zh', 'Chinese')], 44 | 'es': [('af', 'Afrikáans'), ('ar', 'Árabe'), ('bn', 'Bengalí'), ('de', 'Alemán'), ('el', 'Griego'), 45 | ('en', 'Inglés'), ('es', 'Español'), ('fr', 'Francés'), ('hi', 'Hindi'), ('it', 'Italiano'), 46 | ('ja', 'Japonés'), ('jv', 'Javanés'), ('ko', 'Coreano'), ('mr', 'Maratí'), ('ms', 'Malayo'), 47 | ('no', 'Noruego'), ('pl', 'Polaco'), ('pt', 'Portugués'), ('ro', 'Rumano'), ('ru', 'Ruso'), ('sv', 'Sueco'), 48 | ('ta', 'Tamil'), ('tr', 'Turco'), ('uk', 'Ucraniano'), ('vi', 'Vietnamita'), ('zh', 'Chino')], 49 | 'fr': [('af', 'Afrikaans'), ('ar', 'Arabe'), ('bn', 'Bengali'), ('de', 'Allemand'), ('el', 'Grec'), 50 | ('en', 'Anglais'), ('es', 'Espagnol'), ('fr', 'Français'), ('hi', 'Hindi'), ('it', 'Italien'), 51 | ('ja', 'Japonais'), ('jv', 'Javanais'), ('ko', 'Coréen'), ('mr', 'Marathi'), ('ms', 'Malaisien'), 52 | ('no', 'Norvégien'), ('pl', 'Polonais'), ('pt', 'Portugais'), ('ro', 'Roumain'), ('ru', 'Russe'), 53 | ('sv', 'Suédois'), ('ta', 'Tamoul'), ('tr', 'Turc'), ('uk', 'Ukrainien'), ('vi', 'Vietnamien'), 54 | ('zh', 'Chinois')], 55 | 'hi': [('af', 'अफ़्रीकांस'), ('ar', 'अरबी'), ('bn', 'बांग्ला'), ('de', 'जर्मन'), ('el', 'ग्रीक'), 56 | ('en', 'अंग्रेज़ी'), ('es', 'स्पैनिश'), ('fr', 'फ़्रेंच'), ('hi', 'हिन्दी'), ('it', 'इटैलियन'), 57 | ('ja', 'जापानी'), ('jv', 'जैवेनीज़'), ('ko', 'कोरियन'), ('mr', 'मराठी'), ('ms', 'मलय'), ('no', 'नॉर्वेजियन'), 58 | ('pl', 'पोलिश'), ('pt', 'पुर्तगाली'), ('ro', 'रोमेनियन'), ('ru', 'रूसी'), ('sv', 'स्वीडिश'), ('ta', 'तमिल'), 59 | ('tr', 'तुर्क'), ('uk', 'यूक्रेनियन'), ('vi', 'वियतनामी'), ('zh', 'चीनी')], 60 | 'it': [('af', 'Afrikaans'), ('ar', 'Arabo'), ('bn', 'Bengalese'), ('de', 'Tedesco'), ('el', 'Greco'), 61 | ('en', 'Inglese'), ('es', 'Spagnolo'), ('fr', 'Francese'), ('hi', 'Hindi'), ('it', 'Italiano'), 62 | ('ja', 'Giapponese'), ('jv', 'Giavanese'), ('ko', 'Coreano'), ('mr', 'Marathi'), ('ms', 'Malese'), 63 | ('no', 'Norvegese'), ('pl', 'Polacco'), ('pt', 'Portoghese'), ('ro', 'Rumeno'), ('ru', 'Russo'), 64 | ('sv', 'Svedese'), ('ta', 'Tamil'), ('tr', 'Turco'), ('uk', 'Ucraino'), ('vi', 'Vietnamita'), 65 | ('zh', 'Cinese')], 66 | 'ja': [('af', 'アフリカーンス語'), ('ar', 'アラビア語'), ('bn', 'ベンガル語'), ('de', 'ドイツ語'), 67 | ('el', 'ギリシャ語'), ('en', '英語'), 68 | ('es', 'スペイン語'), ('fr', 'フランス語'), ('hi', 'ヒンディー語'), ('it', 'イタリア語'), ('ja', '日本語'), 69 | ('jv', 'ジャワ語'), 70 | ('ko', '韓国語'), ('mr', 'マラーティー語'), ('ms', 'マレー語'), ('no', 'ノルウェー語'), 71 | ('pl', 'ポーランド語'), ('pt', 'ポルトガル語'), 72 | ('ro', 'ルーマニア語'), ('ru', 'ロシア語'), ('sv', 'スウェーデン語'), ('ta', 'タミル語'), ('tr', 'トルコ語'), 73 | ('uk', 'ウクライナ語'), 74 | ('vi', 'ベトナム語'), ('zh', '中国語')], 75 | 'jv': [('af', 'Basa afrikaans'), ('ar', 'Basa arab'), ('bn', 'Basa bengali'), ('de', 'Basa jerman'), 76 | ('el', 'Basa yunani'), ('en', 'Basa inggris'), ('es', 'Basa spanyol'), ('fr', 'Basa prancis'), 77 | ('hi', 'Basa india'), ('it', 'Basa italia'), ('ja', 'Basa jepang'), ('jv', 'Basa jawa'), 78 | ('ko', 'Basa korea'), ('mr', 'Basa marathi'), ('ms', 'Basa malaysia'), ('no', 'Basa norwegia'), 79 | ('pl', 'Basa polandia'), ('pt', 'Basa portugis'), ('ro', 'Basa romawi'), ('ru', 'Basa rusia'), 80 | ('sv', 'Basa swedia'), ('ta', 'Basa tamil'), ('tr', 'Basa turki'), ('uk', 'Basa ukrania'), 81 | ('vi', 'Basa vietnam'), ('zh', 'Basa cina')], 82 | 'ko': [('af', '아프리칸스어'), ('ar', '아랍어'), ('bn', '벵골어'), ('de', '독일어'), ('el', '그리스어'), ('en', '영어'), ('es', '스페인어'), 83 | ('fr', '프랑스어'), ('hi', '힌디어'), ('it', '이탈리아어'), ('ja', '일본어'), ('jv', '자바어'), ('ko', '한국어'), ('mr', '마라티어'), 84 | ('ms', '말레이어'), ('no', '노르웨이어'), ('pl', '폴란드어'), ('pt', '포르투갈어'), ('ro', '루마니아어'), ('ru', '러시아어'), 85 | ('sv', '스웨덴어'), ('ta', '타밀어'), ('tr', '터키어'), ('uk', '우크라이나어'), ('vi', '베트남어'), ('zh', '중국어')], 86 | 'mr': [('af', 'अफ्रिकान्स'), ('ar', 'अरबी'), ('bn', 'बंगाली'), ('de', 'जर्मन'), ('el', 'ग्रीक'), ('en', 'इंग्रजी'), 87 | ('es', 'स्पॅनिश'), ('fr', 'फ्रेंच'), ('hi', 'हिन्दी'), ('it', 'इटालियन'), ('ja', 'जपानी'), ('jv', 'जावानीज'), 88 | ('ko', 'कोरियन'), ('mr', 'मराठी'), ('ms', 'मलय'), ('no', 'नॉर्वेजियन'), ('pl', 'पोलिश'), ('pt', 'पोर्तुगीज'), 89 | ('ro', 'रोमानियन'), ('ru', 'रशियन'), ('sv', 'स्वीडिश'), ('ta', 'तमिळ'), ('tr', 'तुर्की'), 90 | ('uk', 'युक्रेनियन'), ('vi', 'व्हिएतनामी'), ('zh', 'चीनी')], 91 | 'ms': [('af', 'Afrikaans'), ('ar', 'Amhara'), ('bn', 'Basque'), ('de', 'Chichewa'), ('el', 'Cina'), 92 | ('en', 'Corsica'), ('es', 'Czech'), ('fr', 'Frisia'), ('hi', 'Hindi'), ('it', 'Itali'), ('ja', 'Jepun'), 93 | ('jv', 'Jerman'), ('ko', 'Kreol haiti'), ('mr', 'Marathi'), ('ms', 'Melayu'), ('no', 'Parsi'), 94 | ('pl', 'Poland'), ('pt', 'Punjabi'), ('ro', 'Romania'), ('ru', 'Rusia'), ('sv', 'Swahili'), 95 | ('ta', 'Tagalog'), ('tr', 'Turki'), ('uk', 'Ukraine'), ('vi', 'Vietnam'), ('zh', 'Cina')], 96 | 'pl': [('af', 'Afrikaans'), ('ar', 'Arabski'), ('bn', 'Bengalski'), ('de', 'Niemiecki'), ('el', 'Grecki'), 97 | ('en', 'Angielski'), ('es', 'Hiszpański'), ('fr', 'Francuski'), ('hi', 'Hindi'), ('it', 'Włoski'), 98 | ('ja', 'Japoński'), ('jv', 'Jawajski'), ('ko', 'Koreański'), ('mr', 'Marathi'), ('ms', 'Malajski'), 99 | ('no', 'Norweski'), ('pl', 'Polski'), ('pt', 'Portugalski'), ('ro', 'Rumuński'), ('ru', 'Rosyjski'), 100 | ('sv', 'Szwedzki'), ('ta', 'Tamilski'), ('tr', 'Turecki'), ('uk', 'Ukraiński'), ('vi', 'Wietnamski'), 101 | ('zh', 'Chiński')], 102 | 'pt': [('af', 'Africâner'), ('ar', 'Arabe'), ('bn', 'Bengali'), ('de', 'Alemão'), ('el', 'Grego'), ('en', 'Inglês'), 103 | ('es', 'Espanhol'), ('fr', 'Francês'), ('hi', 'Hindi'), ('it', 'Italiano'), ('ja', 'Japonês'), 104 | ('jv', 'Javanês'), ('ko', 'Coreano'), ('mr', 'Marata'), ('ms', 'Malaio'), ('no', 'Norueguês'), 105 | ('pl', 'Polonês'), ('pt', 'Português'), ('ro', 'Romeno'), ('ru', 'Russo'), ('sv', 'Sueco'), ('ta', 'Tâmil'), 106 | ('tr', 'Turco'), ('uk', 'Ucraniano'), ('vi', 'Vietnamita'), ('zh', 'Chinês')], 107 | 'ro': [('af', 'Afrikaans'), ('ar', 'Arabă'), ('bn', 'Bengali'), ('de', 'Germană'), ('el', 'Greacă'), 108 | ('en', 'Engleză'), ('es', 'Spaniolă'), ('fr', 'Franceză'), ('hi', 'Hindi'), ('it', 'Italiană'), 109 | ('ja', 'Japoneză'), ('jv', 'Javaneză'), ('ko', 'Coreeană'), ('mr', 'Marathi'), ('ms', 'Malaeză'), 110 | ('no', 'Norvegiană'), ('pl', 'Poloneză'), ('pt', 'Portugheză'), ('ro', 'Română'), ('ru', 'Rusă'), 111 | ('sv', 'Suedeză'), ('ta', 'Tamilă'), ('tr', 'Turcă'), ('uk', 'Ucraineană'), ('vi', 'Vietnameză'), 112 | ('zh', 'Chineză')], 113 | 'ru': [('af', 'Африкаанс'), ('ar', 'Арабский'), ('bn', 'Бенгальский'), ('de', 'Немецкий'), ('el', 'Греческий'), 114 | ('en', 'Английский'), ('es', 'Испанский'), ('fr', 'Французский'), ('hi', 'Хинди'), ('it', 'Итальянский'), 115 | ('ja', 'Японский'), ('jv', 'Яванский'), ('ko', 'Корейский'), ('mr', 'Маратхи'), ('ms', 'Малайский'), 116 | ('no', 'Норвежский'), ('pl', 'Польский'), ('pt', 'Португальский'), ('ro', 'Румынский'), ('ru', 'Русский'), 117 | ('sv', 'Шведский'), ('ta', 'Тамильский'), ('tr', 'Турецкий'), ('uk', 'Украинский'), ('vi', 'Вьетнамский'), 118 | ('zh', 'Китайский')], 119 | 'ta': [('af', 'ஆஃப்ரிக்கான்ஸ்'), ('ar', 'அரபிக்'), ('bn', 'வங்காளம்'), ('de', 'ஜெர்மன்'), ('el', 'கிரேக்கம்'), 120 | ('en', 'ஆங்கிலம்'), ('es', 'ஸ்பானிஷ்'), ('fr', 'ஃபிரெஞ்சு'), ('hi', 'இந்தி'), ('it', 'இத்தாலியன்'), 121 | ('ja', 'ஜாப்பனிஸ்'), ('jv', 'ஜாவனீஸ்'), ('ko', 'கொரியன்'), ('mr', 'மராத்தி'), ('ms', 'மலாய்'), 122 | ('no', 'நார்வீஜியன்'), ('pl', 'போலிஷ்'), ('pt', 'போர்ச்சுகீஸ்'), ('ro', 'ருமேனியன்'), ('ru', 'ரஷ்யன்'), 123 | ('sv', 'ஸ்வீடிஷ்'), ('ta', 'தமிழ்'), ('tr', 'துருக்கியம்'), ('uk', 'உக்ரைனியன்'), ('vi', 'வியட்னாமீஸ்'), 124 | ('zh', 'சீனம்')], 125 | 'tr': [('af', 'Afrika dili'), ('ar', 'Arapça'), ('bn', 'Bengalce'), ('de', 'Almanca'), ('el', 'Yunanca'), 126 | ('en', 'İngilizce'), ('es', 'İspanyolca'), ('fr', 'Fransızca'), ('hi', 'Hintçe'), ('it', 'İtalyanca'), 127 | ('ja', 'Japonca'), ('jv', 'Cava dili'), ('ko', 'Korece'), ('mr', 'Marathi'), ('ms', 'Malezya dili'), 128 | ('no', 'Norveççe'), ('pl', 'Lehçe'), ('pt', 'Portekizce'), ('ro', 'Romence'), ('ru', 'Rusça'), 129 | ('sv', 'İsveççe'), ('ta', 'Tamil'), ('tr', 'Türkçe'), ('uk', 'Ukraynaca'), ('vi', 'Vietnamca'), 130 | ('zh', 'Çince')], 131 | 'uk': [('af', 'Африкаанс'), ('ar', 'Арабська'), ('bn', 'Бенгальська'), ('de', 'Німецька'), ('el', 'Грецька'), 132 | ('en', 'Англійська'), ('es', 'Іспанська'), ('fr', 'Французька'), ('hi', 'Гінді'), ('it', 'Італійська'), 133 | ('ja', 'Японська'), ('jv', 'Яванська'), ('ko', 'Корейська'), ('mr', 'Маратхі'), ('ms', 'Малайська'), 134 | ('no', 'Норвезька'), ('pl', 'Польська'), ('pt', 'Португальська'), ('ro', 'Румунська'), ('ru', 'Російська'), 135 | ('sv', 'Шведська'), ('ta', 'Тамільська'), ('tr', 'Турецька'), ('uk', 'Українська'), ('vi', 'В’єтнамська'), 136 | ('zh', 'Китайська')], 137 | 'zh': [('af', '布尔语(南非荷兰语)'), ('ar', '阿拉伯语'), ('bn', '孟加拉语'), ('de', '德语'), ('el', '希腊语'), 138 | ('en', '英语'), 139 | ('es', '西班牙语'), ('fr', '法语'), ('hi', '印地语'), ('it', '意大利语'), ('ja', '日语'), 140 | ('jv', '印尼爪哇语'), ('ko', '韩语'), 141 | ('mr', '马拉地语'), ('ms', '马来语'), ('no', '挪威语'), ('pl', '波兰语'), ('pt', '葡萄牙语'), 142 | ('ro', '罗马尼亚语'), ('ru', '俄语'), 143 | ('sv', '瑞典语'), ('ta', '泰米尔语'), ('tr', '土耳其语'), ('uk', '乌克兰语'), ('vi', '越南语'), 144 | ('zh', '中文')] 145 | 146 | } 147 | 148 | 149 | def get_language_name(tag: str, lang: str = '') -> str: 150 | """ 151 | Returns a language name from its tag. 152 | 153 | :param tag: Language tag (ISO 639) 154 | :param lang: Target language (ISO 639). If not supported, will return the English name 155 | :return: Language name 156 | """ 157 | assert isinstance(tag, str) 158 | assert isinstance(lang, str) 159 | if lang != '': 160 | if lang in LANG_NAMES.keys(): 161 | for j in LANG_NAMES[lang]: 162 | if j[0] == tag: 163 | return j[1] 164 | try: 165 | return Lang(tag).name 166 | except InvalidLanguageValue: 167 | return 'Unknown' 168 | 169 | 170 | def tokenize(s: str) -> str: 171 | """ 172 | Tokenize a given word. 173 | 174 | :param s: Word 175 | :return: Tokenized word 176 | """ 177 | # Pre-process 178 | s = str(s) 179 | s = s.replace('_', ' ') # Remove underscore 180 | s = s.replace('–', '-') # uniform chars 181 | s = ''.join([i for i in s if not i.isdigit()]) # remove digits 182 | 183 | # Tokenize 184 | tok = _TOKENIZER.tokenize(s) 185 | if len(tok) >= 1: 186 | return ' '.join(_TOKENIZER.tokenize(s)) 187 | else: 188 | return '' 189 | -------------------------------------------------------------------------------- /PyMultiDictionary/version.py: -------------------------------------------------------------------------------- 1 | """ 2 | PyMultiDictionary 3 | https://github.com/ppizarror/PyMultiDictionary 4 | 5 | VERSION 6 | Defines version. 7 | """ 8 | 9 | __all__ = ['Version', 'vernum', 'ver', 'rev'] 10 | 11 | 12 | class Version(tuple): 13 | """ 14 | Version class. 15 | """ 16 | 17 | __slots__ = () 18 | fields = 'major', 'minor', 'patch' 19 | 20 | def __new__(cls, major, minor, patch) -> tuple: 21 | return tuple.__new__(cls, (major, minor, patch)) 22 | 23 | def __repr__(self) -> str: 24 | fields = (f'{fld}={val}' for fld, val in zip(self.fields, self)) 25 | return f'{self.__class__.__name__}({", ".join(fields)})' 26 | 27 | def __str__(self) -> str: 28 | return '{}.{}.{}'.format(*self) 29 | 30 | major = property(lambda self: self[0]) 31 | minor = property(lambda self: self[1]) 32 | patch = property(lambda self: self[2]) 33 | 34 | 35 | vernum = Version(1, 3, 2) 36 | ver = str(vernum) 37 | rev = '' 38 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | 2 | ================= 3 | PyMultiDictionary 4 | ================= 5 | 6 | .. image:: https://img.shields.io/badge/author-Pablo%20Pizarro%20R.-lightgray.svg 7 | :target: https://ppizarror.com 8 | :alt: @ppizarror 9 | 10 | .. image:: https://img.shields.io/badge/license-MIT-blue.svg 11 | :target: https://opensource.org/licenses/MIT 12 | :alt: License MIT 13 | 14 | .. image:: https://img.shields.io/badge/python-3.7+-red.svg 15 | :target: https://www.python.org/downloads 16 | :alt: Python 3.7+ 17 | 18 | .. image:: https://badge.fury.io/py/PyMultiDictionary.svg 19 | :target: https://pypi.org/project/PyMultiDictionary 20 | :alt: PyPi package 21 | 22 | .. image:: https://img.shields.io/github/actions/workflow/status/ppizarror/PyMultiDictionary/ci.yml?branch=master 23 | :target: https://github.com/ppizarror/PyMultiDictionary/actions/workflows/ci.yml 24 | :alt: Build status 25 | 26 | .. image:: https://app.fossa.com/api/projects/git%2Bgithub.com%2Fppizarror%2FPyMultiDictionary.svg?type=shield 27 | :target: https://app.fossa.com/projects/git%2Bgithub.com%2Fppizarror%2FPyMultiDictionary?ref=badge_shield 28 | :alt: FOSSA Status 29 | 30 | .. image:: https://codecov.io/gh/ppizarror/PyMultiDictionary/branch/master/graph/badge.svg 31 | :target: https://codecov.io/gh/ppizarror/PyMultiDictionary 32 | :alt: Codecov 33 | 34 | .. image:: https://img.shields.io/github/issues/ppizarror/PyMultiDictionary 35 | :target: https://github.com/ppizarror/PyMultiDictionary/issues 36 | :alt: Open issues 37 | 38 | .. image:: https://img.shields.io/pypi/dm/PyMultiDictionary?color=purple 39 | :target: https://pypi.org/project/PyMultiDictionary 40 | :alt: PyPi downloads 41 | 42 | .. image:: https://static.pepy.tech/personalized-badge/PyMultiDictionary?period=total&units=international_system&left_color=grey&right_color=lightgrey&left_text=total%20downloads 43 | :target: https://pepy.tech/project/PyMultiDictionary 44 | :alt: Total downloads 45 | 46 | .. image:: https://img.shields.io/badge/buy%20me%20a-Ko--fi-02b9fe 47 | :target: https://ko-fi.com/ppizarror 48 | :alt: Buy me a Ko-fi 49 | 50 | PyMultiDictionary is a dictionary module for Python 3+ to get meanings, translations, 51 | synonyms and antonyms of words in 20 different languages. It uses educalingo.com, 52 | synonym.com, and Merriam-Webster for getting meanings, translations, synonyms, and antonyms. 53 | 54 | Supported languages 55 | ------------------- 56 | 57 | - Bengali (**bn**) 58 | - German (**de**) 59 | - English (**en**) 60 | - Spanish (**es**) 61 | - French (**fr**) 62 | - Hindi (**hi**) 63 | - Italian (**it**) 64 | - Japanese (**ja**) 65 | - Javanese (**jv**) 66 | - Korean (**ko**) 67 | - Marathi (**mr**) 68 | - Malay (**ms**) 69 | - Polish (**pl**) 70 | - Portuguese (**pt**) 71 | - Romanian (**ro**) 72 | - Russian (**ru**) 73 | - Tamil (**ta**) 74 | - Turkish (**tr**) 75 | - Ukranian (**uk**) 76 | - Chinese (**zh**) 77 | 78 | Install Instructions 79 | -------------------- 80 | 81 | PyMultiDictionary can be installed via pip, for both MacOS, Windows & Linux. Simply run: 82 | 83 | .. code-block:: bash 84 | 85 | $> python3 -m pip install --upgrade PyMultiDictionary 86 | 87 | Usage 88 | ----- 89 | 90 | PyMultiDictionary can be utilized in 2 ways, either by creating a dictionary instance 91 | which can take words as arguments or by creating a dictionary instance with a fixed 92 | amount of words. 93 | 94 | **Create a dictionary object**: 95 | 96 | For example, 97 | 98 | .. code-block:: python 99 | 100 | from PyMultiDictionary import MultiDictionary 101 | dictionary = MultiDictionary() 102 | 103 | This will create a local instance of the MultiDictionary class, and now it can 104 | be used to get meanings, translations, etc. 105 | 106 | For **Meanings**, 107 | 108 | .. code-block:: python 109 | 110 | print(dictionary.meaning('en', 'good')) 111 | 112 | This will return a tuple containing the meanings of the word, in the format 113 | *(word_type, word_meaning, word_wikipedia)*. For example, the above code will return: 114 | 115 | .. code-block:: python 116 | 117 | (['Noun', 'Adjective', 'Exclamation'], 118 | 'The first definition of good in the dictionary is having admirable ...', 119 | 'Good may refer to: ▪ Good and evil, the distinction between positive...') 120 | 121 | All methods support other dictionaries, for example, 'Merriam-Webster' can be used 122 | for English words. 123 | 124 | .. code-block:: python 125 | 126 | from PyMultiDictionary import MultiDictionary, DICT_MW 127 | dictionary = MultiDictionary() 128 | print(dictionary.meaning('en', 'good', dictionary=DICT_MW)) 129 | 130 | Will return: 131 | 132 | .. code-block:: python 133 | 134 | { 135 | 'adjective': ['of a favorable character or tendency', ...], 136 | 'noun': ['something that is good', ...], 137 | 'adverb': ['well'] 138 | } 139 | 140 | For **Synonyms**, 141 | 142 | .. code-block:: python 143 | 144 | print(dictionary.synonym('es', 'Bueno')) 145 | 146 | This will return a list containing the Synonyms of the word. 147 | 148 | For **Antonyms**, 149 | 150 | .. code-block:: python 151 | 152 | print(dictionary.antonym('en', 'Life')) 153 | 154 | This will return a list containing the Antonyms of the word. Currently, only English is supported. 155 | 156 | For **Translations**, 157 | 158 | .. code-block:: python 159 | 160 | print(dictionary.translate('en', 'Range')) 161 | 162 | This will return the word 'Range' translation in 20 different languages. 163 | You can also extend the scope of the translations by providing a target language, 164 | which will use Google Translate API, for example: 165 | 166 | .. code-block:: python 167 | 168 | print(dictionary.translate('en', 'Range', to='ru')) 169 | 170 | Alternatively, you can set a fixed number of words to the Dictionary Instance. This 171 | is helpful if you want to get the meanings of some words quickly without any development need. 172 | 173 | Example: 174 | 175 | .. code-block:: python 176 | 177 | from PyMultiDictionary import MultiDictionary, DICT_EDUCALINGO 178 | 179 | dictionary=MultiDictionary('hotel', 'ambush', 'nonchalant', 'perceptive') 180 | dictionary.set_words_lang('en') # All words are English 181 | 182 | print(dictionary.get_meanings(dictionary=DICT_EDUCALINGO)) # This print the meanings of all the words 183 | print(dictionary.get_synonyms()) # Get synonyms list 184 | print(dictionary.get_antonyms()) # Get antonyms 185 | print(dictionary.get_translations()) # This will translate all words to over 20 languages 186 | print(dictionary.get_translations(to='ru')) # This will translate all words to Russian (if Google API is available) 187 | 188 | Supported dictionaries 189 | ---------------------- 190 | 191 | - **DICT_EDUCALINGO**: Meaning, synonym, translation for all languages 192 | - **DICT_MW**: Meanings (English) - Merriam-Webster 193 | - **DICT_SYNONYMCOM**: Synonyms and Antonyms (English) 194 | - **DICT_THESAURUS**: Synonyms (English) 195 | 196 | There are many more dictionaries to come. Just contribute to this repo! 197 | 198 | Author 199 | ------ 200 | 201 | `Pablo Pizarro R. `_ | 2021 - 2025 202 | -------------------------------------------------------------------------------- /build.py: -------------------------------------------------------------------------------- 1 | """ 2 | PyMultiDictionary 3 | https://github.com/ppizarror/PyMultiDictionary 4 | 5 | BUILD. 6 | """ 7 | 8 | import os 9 | import shutil 10 | import sys 11 | 12 | assert len(sys.argv) == 2, 'Argument is required, usage: build.py pip/twine' 13 | mode = sys.argv[1].strip() 14 | 15 | if mode == 'pip': 16 | if os.path.isdir('dist/'): 17 | for k in os.listdir('dist/'): 18 | if 'pymultidictionary-' in k: 19 | os.remove(f'dist/{k}') 20 | if os.path.isdir('build'): 21 | for k in os.listdir('build'): 22 | if 'bdist.' in k or k == 'lib': 23 | shutil.rmtree(f'build/{k}') 24 | os.system(f'python setup.py sdist bdist_wheel') 25 | 26 | elif mode == 'twine': 27 | if os.path.isdir('dist/'): 28 | os.system(f'python -m twine upload dist/*') 29 | else: 30 | raise FileNotFoundError('Not distribution been found, execute build.py pip') 31 | 32 | else: 33 | raise ValueError(f'Unknown mode {mode}') 34 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | ignore: 2 | - "build.py" 3 | - "PyMultiDictionary/_goslate.py" 4 | - "PyMultiDictionary/_tokenizer.py" 5 | - "setup.py" 6 | - "test/*.py" -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | bs4 2 | iso639-lang 3 | requests -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """ 2 | PyMultiDictionary 3 | https://github.com/ppizarror/PyMultiDictionary 4 | 5 | SETUP DISTRIBUTION 6 | Create setup for PyPI. 7 | """ 8 | 9 | from setuptools import setup, find_packages 10 | import PyMultiDictionary 11 | 12 | # Load readme 13 | with open('README.rst', encoding='utf-8') as f: 14 | long_description = f.read() 15 | 16 | # Load requirements 17 | with open('requirements.txt', encoding='utf-8') as f: 18 | requirements = [] 19 | for line in f: 20 | requirements.append(line.strip()) 21 | 22 | # Setup library 23 | setup( 24 | name=PyMultiDictionary.__module_name__, 25 | version=PyMultiDictionary.__version__, 26 | author=PyMultiDictionary.__author__, 27 | author_email=PyMultiDictionary.__email__, 28 | description=PyMultiDictionary.__description__, 29 | long_description=long_description, 30 | url=PyMultiDictionary.__url__, 31 | project_urls={ 32 | 'Bug Tracker': PyMultiDictionary.__url_bug_tracker__, 33 | 'Documentation': PyMultiDictionary.__url_documentation__, 34 | 'Source Code': PyMultiDictionary.__url_source_code__ 35 | }, 36 | license=PyMultiDictionary.__license__, 37 | platforms=['any'], 38 | keywords=PyMultiDictionary.__keywords__, 39 | classifiers=[ 40 | 'License :: OSI Approved :: MIT License', 41 | 'Natural Language :: English', 42 | 'Operating System :: OS Independent', 43 | 'Programming Language :: Python :: 3.7', 44 | 'Programming Language :: Python :: 3.8', 45 | 'Programming Language :: Python :: 3.9', 46 | 'Programming Language :: Python :: 3.10', 47 | 'Programming Language :: Python :: 3.11', 48 | 'Programming Language :: Python :: 3.12', 49 | 'Programming Language :: Python', 50 | 'Topic :: Multimedia', 51 | 'Topic :: Text Processing' 52 | ], 53 | include_package_data=True, 54 | packages=find_packages(exclude=['test']), 55 | python_requires='>=3.7, <4', 56 | install_requires=requirements, 57 | extras_require={ 58 | 'docs': ['sphinx<7', 'sphinx-autodoc-typehints>=1.2.0', 'sphinx-rtd-theme'], 59 | 'test': ['nose2[coverage_plugin]', 'pytest'] 60 | }, 61 | setup_requires=[ 62 | 'setuptools', 63 | ], 64 | options={ 65 | 'bdist_wheel': {'universal': False} 66 | } 67 | ) 68 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | PyMultiDictionary 3 | https://github.com/ppizarror/PyMultiDictionary 4 | 5 | TESTS 6 | """ 7 | -------------------------------------------------------------------------------- /test/data/synonyms_en_not-bad.txt: -------------------------------------------------------------------------------- 1 | 2 | Another word for NOT BAD > Synonyms & Antonyms
Etymology

1. not

adverb. ['ˈnɑːt'] negation of a word or group of words.

Etymology

  • not (Middle English (1100-1500))

Rhymes with Not Bad

  • sociedad
  • shahrzad
  • ciudad
  • shabad
  • mossad
  • mirad
  • hlad
  • forbad
  • arvad
  • vlad
  • scad
  • riyadh
  • riyad
  • plaid
  • grad
  • gladd
  • glad
  • flad
  • clad
  • brad
  • thad
  • tadd
  • tad
  • shadd
  • shad
  • schad
  • sad
  • radde
  • rad
  • pad

3. bad

adjective. ['ˈbæd'] very intense.

Synonyms

Antonyms

Etymology

  • bad (Middle English (1100-1500))
  • bæd (Old English (ca. 450-1100))

4. bad

adjective. ['ˈbæd'] feeling physical discomfort or pain (tough' is occasionally used colloquially forbad').

Synonyms

Antonyms

Etymology

  • bad (Middle English (1100-1500))
  • bæd (Old English (ca. 450-1100))

5. bad

noun. ['ˈbæd'] that which is below standard or expectations as of ethics or decency.

Etymology

  • bad (Middle English (1100-1500))
  • bæd (Old English (ca. 450-1100))

6. bad

adjective. ['ˈbæd'] (of foodstuffs) not in an edible or usable condition.

Synonyms

Etymology

  • bad (Middle English (1100-1500))
  • bæd (Old English (ca. 450-1100))

7. bad

adverb. ['ˈbæd'] very much; strongly.

Antonyms

Etymology

  • bad (Middle English (1100-1500))
  • bæd (Old English (ca. 450-1100))

8. bad

adverb. ['ˈbæd'] with great intensity (bad' is a nonstandard variant forbadly').

Antonyms

Etymology

  • bad (Middle English (1100-1500))
  • bæd (Old English (ca. 450-1100))

9. bad

adjective. ['ˈbæd'] characterized by wickedness or immorality.

Antonyms

Etymology

  • bad (Middle English (1100-1500))
  • bæd (Old English (ca. 450-1100))

10. bad

adjective. ['ˈbæd'] below average in quality or performance.

Antonyms

Etymology

  • bad (Middle English (1100-1500))
  • bæd (Old English (ca. 450-1100))
-------------------------------------------------------------------------------- /test/test_dictionary.py: -------------------------------------------------------------------------------- 1 | """ 2 | PyMultiDictionary 3 | https://github.com/ppizarror/PyMultiDictionary 4 | 5 | TEST DICTIONARY 6 | Test dictionary object. 7 | """ 8 | 9 | from PyMultiDictionary import * 10 | # noinspection PyProtectedMember 11 | from PyMultiDictionary._dictionary import InvalidLangCode, InvalidDictionary, DictionaryLangNotDefined 12 | import os 13 | import unittest 14 | 15 | _actualpath = str(os.path.abspath(os.path.dirname(__file__))).replace('\\', '/') + '/' 16 | 17 | 18 | class DictionaryTest(unittest.TestCase): 19 | 20 | # noinspection HttpUrlsUsage 21 | @staticmethod 22 | def _get_dictionary(*args) -> 'MultiDictionary': 23 | """ 24 | Returns a dictionary prepared for tests. 25 | """ 26 | d = MultiDictionary(*args) 27 | 28 | # Set example pages 29 | d._test_cached_file = { 30 | 'https://www.merriam-webster.com/dictionary/good': _actualpath + 'data/mw_en_good.txt', 31 | 'https://educalingo.com/en/dic-en/good': _actualpath + 'data/educalingo_en_good.txt', 32 | 'https://www.synonym.com/synonyms/bad': _actualpath + 'data/synonyms_en_bad.txt', 33 | 'https://www.synonym.com/synonyms/good': _actualpath + 'data/synonyms_en_good.txt', 34 | 'https://www.synonym.com/synonyms/not-bad': _actualpath + 'data/synonyms_en_not-bad.txt', 35 | 'https://www.thesaurus.com/browse/for%20this%20reason': _actualpath + 'data/thesaurus-for-this-reason.txt' 36 | } 37 | 38 | return d 39 | 40 | def test_process(self) -> None: 41 | """ 42 | Test word parse before process. 43 | """ 44 | d = self._get_dictionary() 45 | 46 | # Test word parse 47 | self.assertEqual(d._process('word!!! '), 'word') 48 | self.assertEqual(d._process('invalid1'), 'invalid') 49 | self.assertEqual(d._process('multiple words'), 'multiple words') 50 | self.assertEqual(d._process('multiple!!!! words'), 'multiple words') 51 | self.assertEqual(d._process('Abstract'), 'abstract') 52 | self.assertEqual(d._process('1234Abstract'), 'abstract') 53 | self.assertEqual(d._process('1234 Abstract'), 'abstract') 54 | self.assertEqual(d._process('1234 !!! ..... Abstract'), 'abstract') 55 | self.assertEqual(d._process('word.epic'), 'word epic') 56 | self.assertEqual(d._process(' '), '') 57 | self.assertEqual(d._process('\n\n!\nthis word'), 'this word') 58 | self.assertEqual(d._process(''), 'hack') 59 | self.assertEqual(d._process('hyphen-word1111 '), 'hyphen-word') 60 | 61 | # Disable tokenize 62 | d._tokenize = False 63 | self.assertEqual(d._process(''), '') 64 | 65 | def test_meaning(self) -> None: 66 | """ 67 | Test word meaning. 68 | """ 69 | d = self._get_dictionary() 70 | 71 | ds = 'The first definition of good in the dictionary is having admirable, ' \ 72 | 'pleasing, superior, or positive qualities; not negative, bad or mediocre. ' \ 73 | 'Other definition of good is morally excellent or admirable; virtuous; ' \ 74 | 'righteous. Good is also suitable or efficient for a purpose.' 75 | wiki = 'Good may refer to: ▪ Good and evil, the distinction between positive and ' \ 76 | 'negative entities ▪ Good, objects produced for market ▪ Good ▪ Good ▪ Good, ' \ 77 | "West Virginia, USA ▪ Form of the Good, Plato's macrocosmic view of goodness " \ 78 | 'in living Expressive works: ▪ Good ▪ Good, a 2008 film starring Viggo ' \ 79 | 'Mortensen ▪ Good ▪ Good ▪ Good, by Cecil Philip Taylor Companies: ▪ Good ' \ 80 | 'Entertainment ▪ GOOD Music, a record label ▪ Good Technology Music: ▪ ' \ 81 | '"Good", a song by Better Than Ezra from Deluxe...' 82 | self.assertEqual(d.meaning('en', 'good'), (['Noun', 'Adjective', 'Exclamation'], ds, wiki)) 83 | 84 | # Test invalid link 85 | self.assertIsNone(d._bsoup('abc')) 86 | self.assertIsNone(d._bsoup('abc1234aaaaaa.com')) 87 | 88 | # Empty 89 | self.assertEqual(d.meaning('en', ''), ([], '', '')) 90 | 91 | # Test mw 92 | out = {'adjective': ['of a favorable character or tendency', 93 | 'bountiful, fertile', 94 | 'handsome, attractive', 95 | 'suitable, fit', 96 | 'free from injury or disease', 97 | 'not depreciated', 98 | 'commercially sound', 99 | 'that can be relied on', 100 | 'profitable, advantageous', 101 | 'agreeable, pleasant', 102 | 'salutary, wholesome', 103 | 'amusing, clever', 104 | 'of a noticeably large size or quantity : considerable', 105 | 'full', 106 | 'well-founded, cogent', 107 | 'true', 108 | 'deserving of respect : honorable', 109 | 'legally valid or effectual', 110 | 'adequate, satisfactory', 111 | 'conforming to a standard', 112 | 'liking only things that are of good quality : choice, ' 113 | 'discriminating', 114 | 'containing less fat and being less tender than higher grades', 115 | 'landing in the proper area of the court in tennis and similar ' 116 | 'games', 117 | 'successfully done', 118 | 'having everything desired or required : content and not ' 119 | 'wanting or needing to do anything further', 120 | 'virtuous, right, commendable', 121 | 'kind, benevolent', 122 | 'upper-class', 123 | 'competent, skillful', 124 | 'loyal', 125 | 'close', 126 | 'free from infirmity or sorrow'], 127 | 'adverb': ['well'], 128 | 'noun': ['something that is good', 129 | 'something conforming to the moral order of the universe', 130 | 'praiseworthy character : goodness', 131 | 'a good element or portion', 132 | 'advancement of prosperity or well-being', 133 | 'something useful or beneficial', 134 | 'something that has economic utility or satisfies an economic want', 135 | 'personal property having intrinsic value but usually excluding ' 136 | 'money, securities, and negotiable instruments', 137 | 'cloth', 138 | 'something manufactured or produced for sale : wares, merchandise', 139 | 'freight', 140 | 'good persons', 141 | 'the qualities required to achieve an end', 142 | 'proof of wrongdoing']} 143 | self.assertEqual(d.meaning('en', 'good', DICT_MW), out) 144 | 145 | # Test invalid dictionary 146 | self.assertRaises(InvalidDictionary, lambda: d.meaning('es', 'word', DICT_MW)) 147 | 148 | def test_translate(self) -> None: 149 | """ 150 | Test word parse before process. 151 | """ 152 | d = self._get_dictionary() 153 | 154 | # Translate 155 | tr = [('af', 'goeie'), 156 | ('ar', 'جَيِّد'), 157 | ('bn', 'ভাল'), 158 | ('de', 'gut'), 159 | ('el', 'καλός'), 160 | ('en', 'good'), 161 | ('es', 'bueno'), 162 | ('fr', 'bon'), 163 | ('hi', 'अच्छा'), 164 | ('it', 'buono'), 165 | ('ja', '良い'), 166 | ('jv', 'Apik'), 167 | ('ko', '좋은'), 168 | ('mr', 'चांगले'), 169 | ('ms', 'baik'), 170 | ('no', 'bra'), 171 | ('pl', 'dobry'), 172 | ('pt', 'bom'), 173 | ('ro', 'bun'), 174 | ('ru', 'хороший'), 175 | ('sv', 'bra'), 176 | ('ta', 'நல்ல'), 177 | ('tr', 'iyi'), 178 | ('uk', 'гарний'), 179 | ('vi', 'tốt'), 180 | ('zh', '好的')] 181 | s = d.translate('en', 'good') 182 | self.assertEqual(s, tr) 183 | self.assertIsInstance(d.translate('en', 'epic'), list) 184 | 185 | # Translate another language 186 | d.translate('en', 'Good', to='ru') 187 | 188 | # Empty 189 | self.assertEqual(d.translate('en', '!!!'), []) 190 | self.assertEqual(d.translate('en', ' !!! '), []) 191 | 192 | # Test invalid dictionary 193 | self.assertRaises(AssertionError, lambda: d.translate('es', 'word', dictionary=DICT_SYNONYMCOM)) 194 | 195 | def test_synonym(self) -> None: 196 | """ 197 | Test word synonym. 198 | """ 199 | d = self._get_dictionary() 200 | 201 | # Test thesaurus 202 | self.assertEqual(d.synonym('en', 'for this reason', DICT_THESAURUS), 203 | ['accordingly', 'so', 'then', 'thus', 'consequently', 'hence', 'thence', 'and so', 204 | 'ergo', 'for', 'forasmuch as', 'in consequence', 'in that event', 'inasmuch as', 205 | 'on account of', 'on the grounds', 'since', 'therefrom', 'thereupon', 'to that end', 'whence', 206 | 'wherefore', 'therefore', 'on that account']) 207 | 208 | # Synonyms 209 | syn = ['able', 'acceptable', 'accomplished', 'accurate', 'adept', 'adequate', 'admirable', 'adroit', 210 | 'advantage', 'advantageous', 'agreeable', 'altruistic', 'ample', 'appropriate', 'auspicious', 211 | 'authentic', 'avail', 'awesome', 'bad', 'balmy', 'barrie', 'beaut', 'behalf', 'belting', 'beneficent', 212 | 'beneficial', 'benefit', 'benevolent', 'best', 'bitchin´', 'bona fide', 'booshit', 'bright', 'calm', 213 | 'capable', 'capital', 'charitable', 'cheerful', 'choice', 'clear', 'clement', 'clever', 'cloudless', 214 | 'commendable', 'compelling', 'competent', 'complete', 'congenial', 'considerable', 'constructive', 215 | 'convenient', 'convincing', 'convivial', 'correct', 'crucial', 'decorous', 'definite', 'dependable', 216 | 'desirable', 'dexterous', 'dinkum', 'divine', 'dope', 'dutiful', 'eatable', 'edible', 'efficient', 217 | 'enjoyable', 'entire', 'estimable', 'ethical', 'exact', 'excellence', 'excellent', 'exemplary', 'exo', 218 | 'expert', 'extensive', 'fair', 'fancy', 'favourable', 'fine', 'finest', 'first-class', 'first-rate', 219 | 'fit', 'fitting', 'friendly', 'full', 'gain', 'genuine', 'goodness', 'gracious', 'gratifying', 'great', 220 | 'halcyon', 'happy', 'healthy', 'helpful', 'honest', 'honourable', 'humane', 'interest', 'judicious', 221 | 'kind', 'kind-hearted', 'kindly', 'large', 'legitimate', 'long', 'lucrative', 'mannerly', 'merciful', 222 | 'merit', 'mild', 'moral', 'morality', 'obedient', 'obliging', 'opportune', 'orderly', 'pearler', 223 | 'persuasive', 'phat', 'pleasant', 'pleasing', 'pleasurable', 'polite', 'positive', 'praiseworthy', 224 | 'precise', 'probity', 'productive', 'proficient', 'profit', 'profitable', 'proper', 'propitious', 225 | 'prudent', 'rad', 'real', 'reasonable', 'rectitude', 'reliable', 'right', 'righteous', 'righteousness', 226 | 'salubrious', 'salutary', 'satisfactory', 'satisfying', 'schmick', 'seemly', 'sensible', 'service', 227 | 'shrewd', 'sik', 'skilled', 'solid', 'sound', 'special', 'splendid', 'substantial', 'sufficient', 228 | 'suitable', 'sunny', 'sunshiny', 'super', 'superb', 'superior', 'talented', 'tasty', 'thorough', 229 | 'timely', 'tiptop', 'true', 'trustworthy', 'uncorrupted', 'untainted', 'upright', 'uprightness', 'use', 230 | 'useful', 'usefulness', 'valid', 'valuable', 'virtue', 'virtuous', 'welfare', 'well-behaved', 231 | 'well-disposed', 'well-mannered', 'well-reasoned', 'well-thought-out', 'well-timed', 'wellbeing', 232 | 'whole', 'wholesome', 'wicked', 'wise', 'world-class', 'worth', 'worthwhile', 'worthy'] 233 | self.assertEqual(d.synonym('en', 'good'), syn) 234 | self.assertIsInstance(d.synonym('en', 'epic'), list) 235 | 236 | # Define the dictionary combination 237 | self.assertEqual( 238 | d.synonym('en', 'good', DICT_SYNONYMCOM), 239 | ['great', 'nice', 'excellent', 'fine', 'well', 'quality', 'of high quality', 240 | 'of a high standard', 'superior', 'superb', 'acceptable', 'up to the mark', 'up to scratch', 241 | 'in order', 'slap-up', 'bang-up', 'cracking', 'nifty', 'neat', 'goodish', 'smashing', 242 | 'obedient', 'well-behaved', 'best', 'corking', 'respectable', 'favourable', 'not bad', 243 | 'redeeming', 'favorable', 'good enough', 'satisfactory', 'dandy', 'solid', 'keen', 'swell', 244 | 'bully', 'better', 'groovy', 'peachy', 'well behaved', 'ample', 'virtuous', 'righteous', 245 | 'moral', 'ethical', 'upright', 'upstanding', 'principled', 'exemplary', 'clean', 246 | 'goody-goody', 'saintlike', 'right', 'saintly', 'angelical', 'worthy', 'angelic', 247 | 'redemptive', 'saving', 'white', 'goodness', 'sainted', 'beatific', 'advantage', 248 | 'common good', 'vantage', 'virtue', 'righteousness', 'morality', 'uprightness', 249 | 'summum bonum', 'moral excellence', 'kindness', 'virtuousness', 'benignancy', 'graciousness', 250 | 'beneficence', 'benignity', 'honorable', 'estimable', 'beneficial', 'benefit', 'profit', 251 | 'gain', 'interest', 'welfare', 'well-being', 'enjoyment', 'wiseness', 'wisdom', 252 | 'desirability', 'worthiness', 'optimum', 'soundness']) 253 | 254 | # Test with spaces 255 | self.assertEqual( 256 | d.synonym('en', 'not bad', DICT_SYNONYMCOM), 257 | ['atrocious', 'unfavourable', 'corked', 'sad', 'horrid', 'incompetent', 'evil', 'icky', 'fearful', 258 | 'negative', 'painful', 'distressing', 'awful', 'hopeless', 'dreadful', 'terrible', 'rotten', 'rubber', 259 | 'lousy', 'severe', 'worse', 'frightful', 'hard', 'unspeakable', 'corky', 'no-good', 'unfavorable', 260 | 'crappy', 'mediocre', 'swingeing', 'tough', 'quality', 'pitiful', 'naughty', 'lamentable', 'unskilled', 261 | 'deplorable', 'worst', 'stinking', 'disobedient', 'ill', 'shitty', 'uncool', 'pretty', 'abominable', 262 | 'unsuitable', 'sorry', 'poor', 'big', 'uncomfortable', 'undesirability', 'unworthiness', 'inadvisability', 263 | 'badness', 'unsoundness', 'spoilt', 'stale']) 264 | 265 | # Invalid codes 266 | self.assertRaises(InvalidLangCode, lambda: d.synonym('unknown', 'word')) 267 | 268 | # Test invalid dictionary 269 | self.assertRaises(InvalidDictionary, lambda: d.synonym('es', 'word', DICT_SYNONYMCOM)) 270 | 271 | # Empty 272 | self.assertEqual(d.synonym('en', '!!!'), []) 273 | 274 | def test_antonym(self) -> None: 275 | """ 276 | Test antonyms. 277 | """ 278 | d = self._get_dictionary() 279 | self.assertRaises(InvalidLangCode, lambda: d.antonym('es', 'word')) 280 | 281 | # Test downloaded from bs 282 | ant = ['obedient', 'good', 'best', 'better', 'virtuous', 'morality', 283 | 'fragrant', 'unalarming', 'worthiness', 'desirability', 'advisability', 284 | 'goodness', 'asset', 'soundness', 'uncritical', 'amicable', 285 | 'complimentary', 'bold', 'supportive', 'efficient', 'courage', 286 | 'joyful', 'inoffensive', 'qualified'] 287 | self.assertEqual(d.antonym('en', 'bad'), ant) 288 | 289 | ant = ['bad', 'worse', 'unfavorable', 'unrespectable', 'worst', 290 | 'unemotionality', 'passionless', 'immoral', 'evilness', 'wicked', 291 | 'unrighteous', 'unworthy', 'wrong', 'fruitfulness', 'naivete', 292 | 'fidelity', 'worthlessness', 'malignancy', 'evil', 'maleficence', 293 | 'immorality', 'malignity', 'lowercase', 'ordinary', 'disobedience', 294 | 'domineering', 'unpropitious', 'cold', 'cool', 'unworthiness', 295 | 'badness', 'unsoundness', 'nonpregnant'] 296 | self.assertEqual(d.antonym('en', 'good'), ant) 297 | 298 | # Save soup example 299 | d._save_bsoup(f'https://www.synonym.com/synonyms/good', _actualpath + 'data/synonyms_en_good_copy.txt') 300 | 301 | # Empty 302 | self.assertEqual(d.antonym('en', '!!!'), []) 303 | 304 | def test_overwrite_cache(self) -> None: 305 | """ 306 | Test request with maxed out cache. 307 | """ 308 | d = self._get_dictionary('words', 'are', 'super', 'fun') 309 | d.set_words_lang('en') 310 | d._max_cached_websites = 3 311 | 312 | self.assertEqual(len(d.get_synonyms()), 4) 313 | # noinspection PyArgumentEqualDefault 314 | self.assertEqual(len(d.get_synonyms(dictionary=DICT_EDUCALINGO)), 4) 315 | self.assertEqual(len(d.get_synonyms(dictionary=DICT_SYNONYMCOM)), 4) 316 | self.assertEqual(len(d.get_synonyms(dictionary=DICT_THESAURUS)), 4) 317 | self.assertEqual(len(d.get_meanings(dictionary=DICT_MW)), 4) 318 | 319 | def test_language_name(self) -> None: 320 | """ 321 | Test language name. 322 | """ 323 | d = self._get_dictionary() 324 | self.assertEqual(d.get_language_name('en'), 'English') 325 | self.assertEqual(d.get_language_name('en', 'es'), 'Inglés') 326 | self.assertEqual(d.get_language_name('es'), 'Spanish') 327 | self.assertEqual(d.get_language_name('unknown'), 'Unknown') 328 | self.assertEqual(d.get_language_name('zh'), 'Chinese') 329 | 330 | def test_from_list(self) -> None: 331 | """ 332 | Test words from list. 333 | """ 334 | d = self._get_dictionary('words!', 'epic1234') 335 | self.assertEqual(d._words, ['words', 'epic']) 336 | 337 | # Lang not defined yet 338 | self.assertRaises(DictionaryLangNotDefined, lambda: d.get_synonyms()) 339 | d.set_words_lang('en') 340 | self.assertEqual(len(d.get_synonyms()), 2) 341 | self.assertEqual(len(d.get_antonyms()), 2) 342 | self.assertEqual(len(d.get_meanings()), 2) 343 | self.assertEqual(len(d.get_translations()), 2) 344 | -------------------------------------------------------------------------------- /test/test_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | PyMultiDictionary 3 | https://github.com/ppizarror/PyMultiDictionary 4 | 5 | TEST UTILS 6 | Test utils. 7 | """ 8 | 9 | import PyMultiDictionary.version 10 | # noinspection PyProtectedMember 11 | from PyMultiDictionary._utils import tokenize, get_language_name 12 | 13 | import unittest 14 | 15 | 16 | class UtilsTest(unittest.TestCase): 17 | 18 | def test_language_name(self) -> None: 19 | """ 20 | Test language name. 21 | """ 22 | self.assertEqual(get_language_name('en'), 'English') 23 | self.assertEqual(get_language_name('en', 'es'), 'Inglés') 24 | self.assertEqual(get_language_name('es'), 'Spanish') 25 | self.assertEqual(get_language_name('unknown'), 'Unknown') 26 | self.assertEqual(get_language_name('zh'), 'Chinese') 27 | 28 | def test_tokenize(self) -> None: 29 | """ 30 | Test tokenize. 31 | """ 32 | s: str = """ 33 | # ---------------------------------------------------------------------- 34 | # Settings button 35 | # ---------------------------------------------------------------------- 36 | 37 | 38 | """ 39 | t: list = [] 40 | for w in s.split(' '): 41 | tw = tokenize(w) 42 | if tw == '' or '\n' in tw: 43 | continue 44 | t.append(tw) 45 | self.assertEqual(t, ['Settings', 'button']) 46 | self.assertEqual(tokenize('hello!!___..'), 'hello') 47 | self.assertEqual(tokenize('hypen-word!!1111'), 'hypen-word') 48 | self.assertEqual(tokenize('8934205@@hypen-word!!1111'), 'hypen-word') 49 | self.assertEqual(tokenize('@@hypen-word!!1111'), 'hypen-word') 50 | self.assertEqual(tokenize('893420 5@@ hypen-word!!1111'), 'hypen-word') 51 | self.assertEqual(tokenize('893420 5@@ hypen–word!!1111'), 'hypen-word') 52 | self.assertEqual(tokenize('hyphen_word'), 'hyphen word') 53 | self.assertEqual(tokenize('__________________hyphen___word'), 'hyphen word') 54 | self.assertEqual(tokenize('__________________hyphen-word'), 'hyphen-word') 55 | self.assertEqual(tokenize('12345aaa31123'), 'aaa') 56 | self.assertEqual(tokenize('!!!'), '') 57 | self.assertEqual(tokenize('for these reasons'), 'for these reasons') 58 | self.assertEqual(tokenize('for these reasons'), 'for these reasons') 59 | 60 | def test_version(self) -> None: 61 | """ 62 | Test version. 63 | """ 64 | self.assertTrue(isinstance(PyMultiDictionary.version.ver, str)) 65 | self.assertTrue(isinstance(repr(PyMultiDictionary.version.vernum), str)) 66 | self.assertTrue(isinstance(str(PyMultiDictionary.version.vernum), str)) 67 | --------------------------------------------------------------------------------