├── __init__.py ├── .gitignore ├── LICENSE ├── thesaurus.py └── README.md /__init__.py: -------------------------------------------------------------------------------- 1 | from .thesaurus import * -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | #Ipython Notebook 62 | .ipynb_checkpoints 63 | 64 | # Mac Stuff 65 | .DS_Store -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Robert 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, 
"""Unofficial scraper-based API for thesaurus.com.

The public entry point is :class:`Word`: it downloads the thesaurus.com page
for a word and exposes filtered synonyms/antonyms, the word origin and example
sentences.  See the README for usage examples.

Every synonym/antonym entry is stored as a tuple laid out as
``(word, relevance, length, complexity, form)``.
"""

from pprint import pprint

try:
    import requests
    from bs4 import BeautifulSoup
except ImportError:
    # Only fetchWordData() needs the network/HTML libraries.  Deferring the
    # failure to call time keeps the pure filtering logic importable (and
    # testable) without them; behaviour is unchanged when they are installed.
    requests = None
    BeautifulSoup = None


# Filterable attributes in the order they appear in an entry tuple, right
# after the word itself (so 'relevance' lives at index 1, 'form' at index 4).
_FILTER_KEYS = ('relevance', 'length', 'complexity', 'form')


def formatWordUrl(inputWord):
    """Return the thesaurus.com browse URL for ``inputWord``.

    Surrounding whitespace is stripped, the word is lower-cased and inner
    spaces are replaced with ``%20``; no other character is escaped.
    """
    return 'http://www.thesaurus.com/browse/' + \
        inputWord.strip().lower().replace(' ', '%20')


def btw(inputString, lh, rh):
    """Return the part of ``inputString`` between ``lh`` and the next ``rh``.

    If ``rh`` never follows ``lh``, everything after ``lh`` is returned.

    Raises:
        IndexError: if ``lh`` does not occur in ``inputString``.
    """
    return inputString.split(lh, 1)[1].split(rh, 1)[0]


def getFilter(keyName, filters):
    """Look up the filter called ``keyName``; return ``None`` when it is unset.

    ``filters`` may be the nested form ``{'filters': {...}}`` (what the
    original ``synonyms``/``antonyms`` passed along) or a flat mapping of
    filter names to values (what ``Word.filter(relevance=3)`` produces).  The
    flat form used to raise ``KeyError``.
    """
    nested = filters.get('filters')
    if isinstance(nested, dict) and keyName in nested:
        return nested[keyName]
    return filters.get(keyName)


def _asList(value):
    """Return ``value`` as a list, wrapping it first when it is a scalar."""
    if isinstance(value, (list, tuple, set, frozenset)):
        return list(value)
    return [value]


def _normalizeForm(form):
    """Map loose user spellings of a form onto the scraped class names."""
    if not hasattr(form, 'lower'):
        return form
    lowered = form.lower()
    if 'informal' in lowered:
        return 'informal-word'
    if 'common' in lowered:
        return 'common-word'
    return form


def _clean(text):
    """Strip ``text`` and turn curly double quotes into plain ones."""
    # u'' literals so the escapes are real code points on Python 2 as well.
    return text.strip().replace(u'\u201d', u'"').replace(u'\u201c', u'"')


def _parseEntry(link):
    """Convert one scraped ``<a>`` tag into ``(kind, entry_tuple)``.

    ``kind`` is ``'syn'`` for a positive relevance category and ``'ant'``
    otherwise; the tuple is ``(word, relevance, length, complexity, form)``.
    """
    category = int(btw(link.attrs['data-category'], 'relevant-', '"'))

    if category > 0:
        # Synonym links carry 4 trailing characters of "star" text; slicing
        # is cheaper than another select.  Revisit if the site drops it.
        kind, word = 'syn', link.text[:-4]
    else:
        # Antonyms have no star text.  No str() call here: on Python 2 it
        # raised UnicodeEncodeError for non-ASCII words.
        kind, word = 'ant', link.text

    # A missing or empty class attribute means the entry has no form.
    classes = link.attrs.get('class') or [None]

    entry = (
        word,
        abs(category),
        int(link.attrs['data-length']),
        int(link.attrs['data-complexity']),
        classes[0],
    )
    return kind, entry


def fetchWordData(inputWord, timeout=10):
    """Download and parse the thesaurus.com page for ``inputWord``.

    Args:
        inputWord: the word (or phrase) to look up.
        timeout: seconds handed to ``requests.get`` so a stalled connection
            cannot hang the caller forever.

    Returns:
        A list with one dict per definition, each with the keys
        ``'partOfSpeech'``, ``'meaning'``, ``'syn'`` and ``'ant'``, followed
        by one final dict ``{'examples': [...], 'origin': str}`` that
        ``Word.__init__`` pops off.

    Raises:
        ImportError: if ``requests`` or ``beautifulsoup4`` is not installed.
    """
    if requests is None or BeautifulSoup is None:
        raise ImportError(
            "fetchWordData needs the 'requests' and 'beautifulsoup4' packages")

    response = requests.get(formatWordUrl(inputWord), timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    defns = []
    # Each part-of-speech tab corresponds to one definition of the word.
    for defnNum, tab in enumerate(soup.select("div.mask a.pos-tab")):
        curr_def = {
            'partOfSpeech': tab.select('em')[0].text,
            'meaning': tab.select('strong')[0].text,
            'syn': [],
            'ant': [],
        }

        for link in soup.select('div#synonyms-{} li a'.format(defnNum)):
            kind, entry = _parseEntry(link)
            curr_def[kind].append(entry)

        defns.append(curr_def)

    # Origin and examples go in the last element so Word can .pop() them out.
    origin = [_clean(p.text) for p in soup.select("div#word-origin div p")]
    defns.append({
        'examples': [_clean(p.text)
                     for p in soup.select("div#example-sentences div p")],

        # TODO: fix this, as there is a '...' that appears. Use
        # span.oneClick-link
        'origin': origin[0] if origin else '',
    })

    return defns


class Word(object):
    """A word looked up on thesaurus.com.

    Attributes:
        url: the page the data was scraped from.
        data: one dict per definition (see ``fetchWordData``).
        extra: ``{'examples': [...], 'origin': str}``.
    """

    def __init__(self, inputWord):
        # Kept in case the caller wants to visit the page later.
        self.url = formatWordUrl(inputWord)
        self.data = fetchWordData(inputWord)  # fetch the data from thesaurus.com
        self.extra = self.data.pop()

    def __len__(self):
        """Return the number of definitions the word has."""
        return len(self.data)

    ### FUNCTIONS TO HELP ORGANIZE DATA WITHIN THE CLASS ###
    def filter(self, defnNum='all', **filters):
        """Return ``self.data`` narrowed down by the given filters.

        Args:
            defnNum: ``'all'`` or the index of a single definition.
            **filters: any of ``relevance``, ``length``, ``complexity``,
                ``form`` and ``partOfSpeech``.  Each may be one value or a
                collection of accepted values; ``None`` means "do not
                filter".  The nested ``filters={...}`` form is accepted too.

        Returns:
            A list with one item per selected definition:
            ``{'syn': [...], 'ant': [...]}`` holding the matching entry
            tuples, or ``{}`` for a definition whose part of speech was
            filtered out (so indexes stay aligned with ``self.data``).
            An empty list when the word has no definitions.

        Raises:
            IndexError: if ``defnNum`` is an index past the last definition.
        """
        if len(self) == 0:
            return []

        # Collect (tuple index, accepted values) for every active filter.
        wanted = []
        for index, key in enumerate(_FILTER_KEYS, 1):
            value = getFilter(key, filters)
            if value is None:
                continue
            accepted = _asList(value)
            if accepted == [None]:
                continue  # an explicit [None] has always meant "no filter"
            if key == 'form':
                # Forgive loose spellings such as 'Informal' or 'common'.
                accepted = [_normalizeForm(item) for item in accepted]
            wanted.append((index, accepted))

        partOfSpeech = getFilter('partOfSpeech', filters)
        if partOfSpeech is not None:
            # Exact membership: a plain string used to be substring-matched,
            # so 'adverb' also selected 'verb' definitions.
            partOfSpeech = _asList(partOfSpeech)

        if defnNum == 'all':
            selected = range(len(self.data))
        else:
            selected = range(defnNum, defnNum + 1)

        def matches(entry):
            return all(entry[index] in accepted for index, accepted in wanted)

        fdata = []
        for x in selected:
            definition = self.data[x]
            if (partOfSpeech is not None
                    and definition['partOfSpeech'] not in partOfSpeech):
                fdata.append({})
                continue

            fdata.append({
                'syn': [entry for entry in definition['syn'] if matches(entry)],
                'ant': [entry for entry in definition['ant'] if matches(entry)],
            })

        return fdata

    def _words(self, kind, defnNum, allowEmpty, filters):
        """Shared body of ``synonyms`` and ``antonyms`` (``kind`` picks which)."""
        groups = [
            [entry[0] for entry in definition.get(kind, [])]
            for definition in self.filter(defnNum=defnNum, **filters)
        ]

        # The word does not exist: return empty.
        if not groups:
            return []

        if defnNum != 'all':
            return groups[0]
        if allowEmpty:
            return groups
        return [group for group in groups if group]

    ### FUNCTIONS TO RETURN DATA YOU WANT ###
    # Each of the following functions allows you to filter the output
    # accordingly: relevance, partOfSpeech, length, complexity, form.
    def synonyms(self, defnNum=0, allowEmpty=True, **filters):
        """Return the synonyms of one definition, or of all of them.

        Args:
            defnNum: definition index, or ``'all'`` for a list of lists.
            allowEmpty: with ``'all'``, keep (True) or drop (False)
                definitions that have no matching word.
            **filters: see ``filter``.
        """
        return self._words('syn', defnNum, allowEmpty, filters)

    def antonyms(self, defnNum=0, allowEmpty=True, **filters):
        """Return the antonyms; takes the same arguments as ``synonyms``."""
        return self._words('ant', defnNum, allowEmpty, filters)

    def origin(self):
        """Return the scraped origin text ('' when the page had none)."""
        return self.extra['origin']

    def examples(self):
        """Return the list of scraped example sentences."""
        return self.extra['examples']
16 | 17 | ## License 18 | Everything in here is licensed under the MIT license. Do with it what you want– make some money. Just don't get me involved. 19 | 20 | ## Getting Started 21 | First, download the program. 22 | `git clone https://github.com/Manwholikespie/thesaurus-api` 23 | 24 | Then, install its dependencies. 25 | `pip install requests` 26 | `pip install beautifulsoup4` 27 | 28 | Next, navigate to its directory and launch python. 29 | 30 | ```bash 31 | $ cd thesaurus-api/ 32 | $ python 33 | ``` 34 | 35 | In python, the syntax is fairly simple. You begin by importing the `Word` class and creating an instance of it. 36 | 37 | ```python 38 | >>> from thesaurus import Word 39 | >>> myWord = Word('box') 40 | ``` 41 | From here, if you wish to get the word's synonyms, you can use the `.synonyms()` function. 42 | *Note: All of the following information about this function also applies to its inverse, `.antonyms()`* 43 | 44 | ```python 45 | >>> myWord.synonyms() 46 | [u'carton', u'crate', u'pack', u'trunk', u'package', u'case', u'bin', u'casket', u'chest', u'coffer', u'portmanteau', u'receptacle'] 47 | ``` 48 | This will get you all of the synonyms under the word's first definition. To see how many definitions a word has, you can measure its length. 49 | 50 | ```python 51 | >>> len(myWord) 52 | 3 53 | ``` 54 | The index of its definitions begins at 0, so to get the synonyms for the second definition, you would use: 55 | 56 | ```python 57 | >>> myWord.synonyms(1) 58 | [u'wrap', u'pack', u'case', u'crate', u'confine', u'package', u'encase'] 59 | ``` 60 | If you used a 0 instead of a 1, you would get the same data as in the first example. If you want to get a list of all the synonyms, but still separated by their definition, you would use 'all'. 
61 | 62 | ```python 63 | >>> myWord.synonyms('all') 64 | [[u'carton', 65 | u'crate', 66 | u'pack', 67 | u'trunk', 68 | u'package', 69 | u'case', 70 | u'bin', 71 | u'casket', 72 | u'chest', 73 | u'coffer', 74 | u'portmanteau', 75 | u'receptacle'], 76 | [u'wrap', u'pack', u'case', u'crate', u'confine', u'package', u'encase'], 77 | [u'slug', 78 | u'hit', 79 | u'mix', 80 | u'buffet', 81 | u'scrap', 82 | u'sock', 83 | u'slap', 84 | u'strike', 85 | u'cuff', 86 | u'clout', 87 | u'wallop', 88 | u'spar', 89 | u'whack', 90 | u'duke', 91 | u'exchange blows']] 92 | ``` 93 | This is a lot of data, though, and we may not need all of it. Say you want to filter through the first definition for your word and find words that are of relevance 3. 94 | 95 | ```python 96 | >>> myWord.synonyms(relevance=3) 97 | [u'carton', u'crate', u'pack', u'trunk', u'package'] 98 | ``` 99 | But maybe you want a bit more data, and you aren't being too strict on relevance, so you could settle for a few level 2's in there. The following will include both relevance 2 and 3. 100 | 101 | ```python 102 | >>> myWord.synonyms(relevance=[2,3]) 103 | [u'carton', u'crate', u'pack', u'trunk', u'package', u'case', u'bin', u'casket', u'chest', u'coffer', u'portmanteau', u'receptacle'] 104 | ``` 105 | 106 | This API allows for quite a few fun filtering options. If we wanted to look through all of the definitions of the word 'old', and find words which are complex, lengthy, but still have good relevance: 107 | 108 | ```python 109 | >>> Word('old').synonyms('all',relevance=[2,3], complexity=[2,3], length=3) 110 | [[], [u'old-fashioned', u'antediluvian'], []] 111 | ``` 112 | 113 | You can also search strictly for results that are `'common'` or `'informal'`. Please note that common does not imply not informal. The majority of words are neither common nor informal. 
114 | 115 | ```python 116 | >>> Word('old').synonyms('all', form='informal') 117 | [[u'hoary', u'wasted'], [u'hackneyed'], []] 118 | >>> Word('old').synonyms(1,form='common') 119 | [u'old-fashioned', 120 | u'former', 121 | u'traditional', 122 | u'original', 123 | u'past', 124 | u'remote', 125 | u'dated', 126 | u'done', 127 | u'early', 128 | u'late', 129 | u'once', 130 | u'sometime'] 131 | ``` 132 | 133 | Finally, you can search by a definition's part-of-speech. The available options are: 134 | - `'noun'` 135 | - `'verb'` 136 | - `'adj'` 137 | - `'adv'` 138 | - `'as in'` (usually for pronouns or interjections) 139 | - `'prep'` 140 | - `'conjunction'` 141 | 142 | When using the `partOfSpeech` filter, it is important to use `'all'`, otherwise you will get nothing in the case that the word's first definition does not have the part of speech you asked for. 143 | 144 | ```python 145 | >>> Word('box').synonyms('all', partOfSpeech='noun') 146 | [[u'carton', 147 | u'crate', 148 | u'pack', 149 | u'trunk', 150 | u'package', 151 | u'case', 152 | u'bin', 153 | u'casket', 154 | u'chest', 155 | u'coffer', 156 | u'portmanteau', 157 | u'receptacle'], 158 | [], 159 | []] 160 | >>> Word('box').synonyms('all', partOfSpeech='verb') 161 | [[], 162 | [u'wrap', u'pack', u'case', u'crate', u'confine', u'package', u'encase'], 163 | [u'slug', 164 | u'hit', 165 | u'mix', 166 | u'buffet', 167 | u'scrap', 168 | u'sock', 169 | u'slap', 170 | u'strike', 171 | u'cuff', 172 | u'clout', 173 | u'wallop', 174 | u'spar', 175 | u'whack', 176 | u'duke', 177 | u'exchange blows']] 178 | ``` 179 | If you do not want to keep the empty definition results in there, you can use `allowEmpty=False` when making your search: 180 | 181 | ```python 182 | >>> Word('box').synonyms('all', partOfSpeech='noun', allowEmpty=False) 183 | [[u'carton', 184 | u'crate', 185 | u'pack', 186 | u'trunk', 187 | u'package', 188 | u'case', 189 | u'bin', 190 | u'casket', 191 | u'chest', 192 | u'coffer', 193 | u'portmanteau', 194 | u'receptacle']] 
195 | ``` 196 | 197 | To recap, the available filtering options and their parameters are: 198 | 199 | ```python 200 | relevance=[1,2,3] 201 | length=[1,2,3] 202 | complexity=[1,2,3] 203 | partOfSpeech=['verb','noun','adj','adv','as in','prep','conjunction'] 204 | form=['common','informal'] 205 | ``` 206 | 207 | If you want to filter the data in your own way, you can access the raw word data (it's in tuple form... you can see the key in thesaurus.py) by calling `.data` on the Word instance. 208 | 209 | As for the other functions, 210 | 211 | ```python 212 | >>> myWord = Word('kettle') 213 | >>> myWord.origin() 214 | u'kettle O.E. cetil (Mercian), from L. catillus "deep pan or dish for cooking," dim. of catinus "bowl, dish, pot." A general Gmc. borrowing (cf. O.S. ketel, O.Fris. zetel, M.Du. ketel, O.H.G. kezzil, Ger. Kessel). Spelling with a -k- (c.1300) probably is from infl. of O.N. cognate ketill. The smaller sense of "tea-kettle" is 20c. Kettledrum is from 1542.' 215 | >>> myWord.examples() 216 | [u'Agnes was bending with red eyes over a kettle which was boiling on the fire.', 217 | u'She insisted on making tea, and was too quick with the kettle for Edward to help her.', 218 | u'The hot ascending current passes close by the metal sides of the kettle; while the cold descending current passes down the centre.', 219 | u'This is distilled water, and is purer than that in the kettle.', 220 | u'In the winter Snow-white lighted the fire, and put the kettle on, after scouring it, so that it resembled gold in brightness.', 221 | u'That remarkable change of attitude of his now included the kettle.', 222 | u"And as Dick gracefully reminds me, the pot can't call the kettle black.", 223 | u'Take them from the kettle, drain, and brown with butter, salt and pepper.', 224 | u'When Bill Haden returned from work he found the room done up, the table laid for tea, and the kettle on the fire.', 225 | u'But, to his surprise, no tanuki was there, nothing but the kettle he had found in 
the corner.'] 226 | ``` 227 | 228 | ## Coming Soon 229 | ~~Make a findWord(inputWord) function that will return both synonyms and antonyms of individual ranks into a dictionary.~~ 230 | 231 | ~~A Function that allows you to search for the synonyms/antonyms of a different definition of the word you are searching for (right now those are hidden in different tabs, but I should be able to fix that by changing the beautifulsoup selector to div#synonyms-[1,2,3, etc.].~~ 232 | 233 | ~~Make a class that allows us to call anything we want from it more easily. I want to just specify a word class with the only input being the word, and then call word.synonyms, word.origin, etc.~~ 234 | 235 | ~~Come up with a more organized way of naming the functions so that I don't confuse people.~~ 236 | 237 | In addition to having a ['meaning'] part of each definition's dictionary when using findWordTotal, add a ['nltk meaning'] section so that it plays nicely with nltk's part-of-speech tagger. 238 | 239 | Add automated tests and badges to show supported versions of Python, and detect any errors. 240 | 241 | ## Special Thanks 242 | To [James](https://github.com/jaykm/) for the idea to just use rstrip() instead of something much more complicated to single-out an entry's relevanceLevel. 243 | 244 | To [Kyle](https://github.com/AFishNamedFish) for his interest in this project. You rock, Kyle. 245 | 246 | To [Stefano](https://github.com/stefano-bragaglia) for suggesting that I add filtering to function output. 247 | 248 | To [Suhas](https://github.com/syelluru) for correcting my errors. 249 | --------------------------------------------------------------------------------