├── .gitignore ├── LICENSE.md ├── README.md └── dictionary_crawler ├── dictionary_crawler ├── __init__.py ├── items.py ├── middlewares.py ├── pipelines.py ├── settings.py └── spiders │ └── __init__.py └── scrapy.cfg /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.gitignore.io/api/linux,python,pycharm+iml,pycharm+all 2 | # Edit at https://www.gitignore.io/?templates=linux,python,pycharm+iml,pycharm+all 3 | 4 | ### Linux ### 5 | *~ 6 | 7 | # temporary files which can be created if a process still has a handle open of a deleted file 8 | .fuse_hidden* 9 | 10 | # KDE directory preferences 11 | .directory 12 | 13 | # Linux trash folder which might appear on any partition or disk 14 | .Trash-* 15 | 16 | # .nfs files are created when an open file is removed but is still being accessed 17 | .nfs* 18 | 19 | ### PyCharm+all ### 20 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 21 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 22 | 23 | # User-specific stuff 24 | .idea/**/workspace.xml 25 | .idea/**/tasks.xml 26 | .idea/**/usage.statistics.xml 27 | .idea/**/dictionaries 28 | .idea/**/shelf 29 | 30 | # Generated files 31 | .idea/**/contentModel.xml 32 | 33 | # Sensitive or high-churn files 34 | .idea/**/dataSources/ 35 | .idea/**/dataSources.ids 36 | .idea/**/dataSources.local.xml 37 | .idea/**/sqlDataSources.xml 38 | .idea/**/dynamic.xml 39 | .idea/**/uiDesigner.xml 40 | .idea/**/dbnavigator.xml 41 | 42 | # Gradle 43 | .idea/**/gradle.xml 44 | .idea/**/libraries 45 | 46 | # Gradle and Maven with auto-import 47 | # When using Gradle or Maven with auto-import, you should exclude module files, 48 | # since they will be recreated, and may cause churn. Uncomment if using 49 | # auto-import. 
50 | # .idea/modules.xml 51 | # .idea/*.iml 52 | # .idea/modules 53 | # *.iml 54 | # *.ipr 55 | 56 | # CMake 57 | cmake-build-*/ 58 | 59 | # Mongo Explorer plugin 60 | .idea/**/mongoSettings.xml 61 | 62 | # File-based project format 63 | *.iws 64 | 65 | # IntelliJ 66 | out/ 67 | 68 | # mpeltonen/sbt-idea plugin 69 | .idea_modules/ 70 | 71 | # JIRA plugin 72 | atlassian-ide-plugin.xml 73 | 74 | # Cursive Clojure plugin 75 | .idea/replstate.xml 76 | 77 | # Crashlytics plugin (for Android Studio and IntelliJ) 78 | com_crashlytics_export_strings.xml 79 | crashlytics.properties 80 | crashlytics-build.properties 81 | fabric.properties 82 | 83 | # Editor-based Rest Client 84 | .idea/httpRequests 85 | 86 | # Android studio 3.1+ serialized cache file 87 | .idea/caches/build_file_checksums.ser 88 | 89 | ### PyCharm+all Patch ### 90 | # Ignores the whole .idea folder and all .iml files 91 | # See https://github.com/joeblau/gitignore.io/issues/186 and https://github.com/joeblau/gitignore.io/issues/360 92 | 93 | .idea/ 94 | 95 | # Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-249601023 96 | 97 | *.iml 98 | modules.xml 99 | .idea/misc.xml 100 | *.ipr 101 | 102 | # Sonarlint plugin 103 | .idea/sonarlint 104 | 105 | ### PyCharm+iml ### 106 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 107 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 108 | 109 | # User-specific stuff 110 | 111 | # Generated files 112 | 113 | # Sensitive or high-churn files 114 | 115 | # Gradle 116 | 117 | # Gradle and Maven with auto-import 118 | # When using Gradle or Maven with auto-import, you should exclude module files, 119 | # since they will be recreated, and may cause churn. Uncomment if using 120 | # auto-import. 121 | # .idea/modules.xml 122 | # .idea/*.iml 123 | # .idea/modules 124 | # *.iml 125 | # *.ipr 126 | 127 | # CMake 128 | 129 | # Mongo Explorer plugin 130 | 131 | # File-based project format 132 | 133 | # IntelliJ 134 | 135 | # mpeltonen/sbt-idea plugin 136 | 137 | # JIRA plugin 138 | 139 | # Cursive Clojure plugin 140 | 141 | # Crashlytics plugin (for Android Studio and IntelliJ) 142 | 143 | # Editor-based Rest Client 144 | 145 | # Android studio 3.1+ serialized cache file 146 | 147 | ### PyCharm+iml Patch ### 148 | # Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-249601023 149 | 150 | 151 | ### Python ### 152 | # Byte-compiled / optimized / DLL files 153 | __pycache__/ 154 | *.py[cod] 155 | *$py.class 156 | 157 | # C extensions 158 | *.so 159 | 160 | # Distribution / packaging 161 | .Python 162 | build/ 163 | develop-eggs/ 164 | dist/ 165 | downloads/ 166 | eggs/ 167 | .eggs/ 168 | lib/ 169 | lib64/ 170 | parts/ 171 | sdist/ 172 | var/ 173 | wheels/ 174 | pip-wheel-metadata/ 175 | share/python-wheels/ 176 | *.egg-info/ 177 | .installed.cfg 178 | *.egg 179 | MANIFEST 180 | 181 | # PyInstaller 182 | # Usually these files are written by a python script from a template 183 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
184 | *.manifest 185 | *.spec 186 | 187 | # Installer logs 188 | pip-log.txt 189 | pip-delete-this-directory.txt 190 | 191 | # Unit test / coverage reports 192 | htmlcov/ 193 | .tox/ 194 | .nox/ 195 | .coverage 196 | .coverage.* 197 | .cache 198 | nosetests.xml 199 | coverage.xml 200 | *.cover 201 | .hypothesis/ 202 | .pytest_cache/ 203 | 204 | # Translations 205 | *.mo 206 | *.pot 207 | 208 | # Django stuff: 209 | *.log 210 | local_settings.py 211 | db.sqlite3 212 | db.sqlite3-journal 213 | 214 | # Flask stuff: 215 | instance/ 216 | .webassets-cache 217 | 218 | # Scrapy stuff: 219 | .scrapy 220 | 221 | # Sphinx documentation 222 | docs/_build/ 223 | 224 | # PyBuilder 225 | target/ 226 | 227 | # Jupyter Notebook 228 | .ipynb_checkpoints 229 | 230 | # IPython 231 | profile_default/ 232 | ipython_config.py 233 | 234 | # pyenv 235 | .python-version 236 | 237 | # pipenv 238 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 239 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 240 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 241 | # install all needed dependencies. 242 | #Pipfile.lock 243 | 244 | # celery beat schedule file 245 | celerybeat-schedule 246 | 247 | # SageMath parsed files 248 | *.sage.py 249 | 250 | # Environments 251 | .env 252 | .venv 253 | env/ 254 | venv/ 255 | ENV/ 256 | env.bak/ 257 | venv.bak/ 258 | 259 | # Spyder project settings 260 | .spyderproject 261 | .spyproject 262 | 263 | # Rope project settings 264 | .ropeproject 265 | 266 | # mkdocs documentation 267 | /site 268 | 269 | # mypy 270 | .mypy_cache/ 271 | .dmypy.json 272 | dmypy.json 273 | 274 | # Pyre type checker 275 | .pyre/ 276 | 277 | # End of https://www.gitignore.io/api/linux,python,pycharm+iml,pycharm+all -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | MIT License 4 | 5 | Copyright (c) 2019 Peyman Mohseni Kiasari 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 8 | 9 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 10 | 11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ![](https://st.hzcdn.com/simgs/93e15c6309f22aa9_4-3114/home-design.jpg)
2 | 
3 | Art by: [Altered Artichoke](https://alteredartichoke.com)
4 | 
5 | 
6 | # Dictionary Crawler
7 | This is Python code, based on the Scrapy package, for crawling well-known online dictionaries such as Oxford, Longman, Cambridge, Webster, and Collins.
8 | Be aware that this code was published on July 21, 2019; if those sites are updated in the future, it may stop working properly.
9 | ## Prerequisites
10 | The Scrapy Python package
11 | 
12 | ## How to use it?
13 | ##### 1- Set the words:
14 | In *dictionary_crawler/dictionary_crawler/spiders* there is an **\__init__.py** file containing a list named **words**, initialized like this:
15 | ```python
16 | words = ['I', 'hope', 'you', 'like', 'this', 'dictionary', 'web', 'crawler']
17 | ```
18 | Change this list to the words you want to crawl.
19 | ##### 2- Run:
20 | Go to the *dictionary_crawler/dictionary_crawler/spiders* directory and run this in the terminal:
21 | 
22 |     scrapy crawl [name of dictionary] -o [name of file to write into].jl
23 | 
24 | For example, for the Oxford dictionary:
25 | 
26 |     scrapy crawl oxford -o oxford.jl
27 | The data will be stored in oxford.jl in *dictionary_crawler/dictionary_crawler/spiders*.
28 | 
29 | ## What does the output look like?
30 | The output is a JSON Lines file in which each line maps a word to its definitions, grouped by part of speech (a snippet for loading the file back into Python is included at the end of this README).
31 | For example, the word **hope** from the Webster dictionary looks like this:
32 | ```json
33 | {"hope":
34 |     {
35 |     "verb": ["to cherish a desire with anticipation to want something to happen or be true"],
36 |     "noun": ["to desire with expectation of obtainment or fulfillment", "to expect with confidence", "city in southwestern Arkansas that was the childhood home of President Bill Clinton population 10,095"],
37 |     "biographical name": ["desire accompanied by expectation of or belief in fulfillment", "expectation of fulfillment or success", "someone or something on which hopes are centered", "something desired or hoped (see 1) for", "Anthony", "Bob 1903–2003 originally Leslie Townes Hope American (British-born) comedian"],
38 |     "geographical name": ["Victor Alexander John 1887–1951 2nd Marquis of", "British soldier; viceroy of India (1936–43)"]
39 |     }
40 | }
41 | ```
42 | ## Author
43 | 
44 | * **Peyman Mohseni kiasari**
45 | 
46 | ## License
47 | 
48 | This project is licensed under the MIT License - see the [LICENSE.md](LICENSE.md) file for details.
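## Loading the output

A minimal sketch of reading one of the crawled `.jl` files back into a single Python dictionary. It assumes the `oxford.jl` file produced by the example command above; the merge logic simply keeps the first entry seen for each word.

```python
import json

definitions = {}  # word -> {part_of_speech: [definitions]}
with open("oxford.jl", encoding="utf-8") as jl_file:
    for line in jl_file:
        line = line.strip()
        if not line:
            continue
        record = json.loads(line)      # each line is one {"word": {...}} object
        for word, entry in record.items():
            definitions.setdefault(word, entry)

print(definitions.get("hope", {}))
```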
49 | -------------------------------------------------------------------------------- /dictionary_crawler/dictionary_crawler/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kiasar/Dictionary_crawler/25d2b6203b64ee4d8a4b8fad83ebfe5d3fb615a0/dictionary_crawler/dictionary_crawler/__init__.py -------------------------------------------------------------------------------- /dictionary_crawler/dictionary_crawler/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class DictionaryCrawlerItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /dictionary_crawler/dictionary_crawler/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class DictionaryCrawlerSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class DictionaryCrawlerDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 
67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /dictionary_crawler/dictionary_crawler/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class DictionaryCrawlerPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /dictionary_crawler/dictionary_crawler/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for dictionary_crawler project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'dictionary_crawler' 13 | 14 | SPIDER_MODULES = ['dictionary_crawler.spiders'] 15 | NEWSPIDER_MODULE = 'dictionary_crawler.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'dictionary_crawler (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | FEED_EXPORT_ENCODING = "utf-8" 24 | 25 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 26 | CONCURRENT_REQUESTS = 512 27 | 28 | # Configure a delay for requests for the same website (default: 0) 29 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 30 | # See also autothrottle settings and docs 31 | DOWNLOAD_DELAY = 0.0 32 | # The download delay setting will honor only one of: 33 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 34 | #CONCURRENT_REQUESTS_PER_IP = 16 35 | 36 | # Disable cookies (enabled by default) 37 | #COOKIES_ENABLED = False 38 | 39 | # Disable Telnet Console (enabled by default) 40 | #TELNETCONSOLE_ENABLED = False 41 | 42 | # Override the default request headers: 43 | #DEFAULT_REQUEST_HEADERS = { 44 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 45 | # 'Accept-Language': 'en', 46 | #} 47 | 48 | # Enable or disable spider middlewares 49 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 50 | #SPIDER_MIDDLEWARES = { 51 | # 'dictionary_crawler.middlewares.DictionaryCrawlerSpiderMiddleware': 543, 52 | #} 53 | 54 | # Enable or disable downloader middlewares 55 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 56 | #DOWNLOADER_MIDDLEWARES = { 57 | # 'dictionary_crawler.middlewares.DictionaryCrawlerDownloaderMiddleware': 543, 58 | #} 59 | 60 | # Enable or disable extensions 61 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 62 | #EXTENSIONS = { 63 | # 'scrapy.extensions.telnet.TelnetConsole': None, 64 | #} 65 | 66 | # Configure item pipelines 67 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 68 | #ITEM_PIPELINES = { 69 | # 'dictionary_crawler.pipelines.DictionaryCrawlerPipeline': 300, 70 | #} 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 74 | #AUTOTHROTTLE_ENABLED = True 75 | # The initial download delay 76 | #AUTOTHROTTLE_START_DELAY = 5 77 | # The maximum download delay to be set in case of high latencies 78 | #AUTOTHROTTLE_MAX_DELAY = 60 79 | # The average number of requests Scrapy should be sending in parallel to 80 | # each remote server 81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 82 | # Enable showing throttling stats for every response received: 83 | #AUTOTHROTTLE_DEBUG = False 84 | 85 | # Enable and configure HTTP caching (disabled by default) 86 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 87 | #HTTPCACHE_ENABLED = True 88 | #HTTPCACHE_EXPIRATION_SECS = 0 89 | #HTTPCACHE_DIR = 'httpcache' 90 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 92 | -------------------------------------------------------------------------------- 
/dictionary_crawler/dictionary_crawler/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | import re
2 | 
3 | import scrapy
4 | 
5 | # by Peyman (mohsenikiasari@ce.sharif.edu) in 2019.
6 | 
7 | words = ['I', 'hope', 'you', 'like', 'this', 'dictionary', 'web', 'crawler']
8 | 
9 | # scrapy crawl oxford -o oxford.jl
10 | class OxfordCrawler(scrapy.Spider):
11 |     name = "oxford"
12 |     allowed_domains = ["www.lexico.com"]
13 |     start_urls = ["https://www.lexico.com/en/definition/" + word for word in words]
14 | 
15 |     def parse(self, response):
16 |         word = response.request.url.split("/")[-1]
17 |         definition_dict = {}
18 | 
19 |         for sections in response.xpath("//section[@class='gramb']"):
20 |             try:
21 |                 part_of_speech = sections.xpath(".//span[@class='pos']/text()").extract()[0]
22 |             except IndexError:  # no part-of-speech label found
23 |                 part_of_speech = False
24 |             def_list = sections.xpath("./ul/li/div[@class='trg']//span[@class='ind']").extract()
25 |             if not def_list:
26 |                 def_list = sections.xpath(".//div[@class='empty_sense']//div[@class='crossReference']").extract()
27 | 
28 |             def_list = [re.sub(r'<.*?>', "", i).strip() for i in def_list]
29 |             def_list = [i for i in def_list if i]
30 | 
31 |             if def_list and part_of_speech:
32 |                 if part_of_speech in definition_dict:
33 |                     definition_dict[part_of_speech] += def_list
34 |                 else:
35 |                     definition_dict[part_of_speech] = def_list
36 | 
37 |         if definition_dict:
38 |             yield {word: definition_dict}
39 | 
40 | 
41 | # scrapy crawl longman -o longman.jl
42 | class LongmanCrawler(scrapy.Spider):
43 |     name = "longman"
44 |     allowed_domains = ["www.ldoceonline.com"]
45 |     start_urls = ["https://www.ldoceonline.com/dictionary/" + word for word in words]
46 | 
47 |     def parse(self, response):
48 |         word = response.request.url.split("/")[-1]
49 |         definition_dict = {}
50 | 
51 |         for sections in response.xpath("//span[@class='dictentry']"):
52 |             try:
53 |                 part_of_speech = (sections.xpath(".//span[@class='POS']/text()").extract()[0]).strip()
54 |             except IndexError:  # no part-of-speech label found
55 |                 part_of_speech = False
56 |             def_list = sections.xpath(".//span[@class='Sense']/span[@class='DEF']").extract()
57 |             def_list = [re.sub(r'<.*?>', "", i[18:-7]).strip() for i in def_list]
58 |             def_list = [i for i in def_list if i]
59 | 
60 |             if def_list and part_of_speech:
61 |                 if part_of_speech in definition_dict:
62 |                     definition_dict[part_of_speech] += def_list
63 |                 else:
64 |                     definition_dict[part_of_speech] = def_list
65 | 
66 |         if definition_dict:
67 |             yield {word: definition_dict}
68 | 
69 | 
70 | # scrapy crawl cambridge -o cambridge.jl
71 | class CambridgeCrawler(scrapy.Spider):
72 |     name = "cambridge"
73 |     allowed_domains = ["dictionary.cambridge.org"]
74 |     start_urls = ["https://dictionary.cambridge.org/dictionary/english/" + word for word in words]
75 | 
76 |     def parse(self, response):
77 |         word = response.request.url.split("/")[-1]
78 |         definition_dict = {}
79 | 
80 |         for entry in response.xpath("//div[@class='entry-body__el clrd js-share-holder']"):
81 |             part_of_speeches = entry.xpath("./div[@class='pos-header']//span[@class='pos']/text()").extract()
82 |             def_list = entry.xpath(
83 |                 ".//div[@class='sense-body']/div[@class='def-block pad-indent']//b[@class='def']").extract()
84 |             def_list = [re.sub(r'<.*?>|:', "", i[15:-4]).strip() for i in def_list]
85 |             def_list = [i for i in def_list if i]
86 | 
87 |             if def_list and part_of_speeches:
88 |                 # attach this block's definitions to every listed part of speech
89 |                 for part_of_speech in part_of_speeches:
90 |                     if part_of_speech in definition_dict:
91 |                         definition_dict[part_of_speech] += def_list
92 |                     else:
93 |                         definition_dict[part_of_speech] = def_list
94 | 
95 |         if definition_dict:
96 |             yield {word: definition_dict}
97 | 
98 | 
99 | # scrapy crawl webster -o webster.jl
100 | class WebsterCrawler(scrapy.Spider):
101 |     name = "webster"
102 |     allowed_domains = ["www.merriam-webster.com"]
103 |     start_urls = ["https://www.merriam-webster.com/dictionary/" + word for word in words]
104 | 
105 |     def parse(self, response):
106 |         word = response.request.url.split("/")[-1]
107 |         definition_dict = {}
108 | 
109 |         part_of_speeches = [re.sub(r'\(.*\)', "", i).strip() for i in
110 |                             response.xpath("//span[@class='fl']/a/text()|//span[@class='fl']/text()").extract()]
111 | 
112 |         for sections in response.xpath("//div[contains(@id, 'dictionary-entry')]/div[@class='vg']"):
113 |             part_of_speech = part_of_speeches.pop(0)
114 |             def_list = sections.xpath(
115 |                 ".//span[@class='dtText' or @class='unText'][not(ancestor::span[@class='dtText'])]").extract()
116 |             def_list = [re.sub(r'.+', "", i[21:-7]) for i in def_list]
117 |             def_list = [re.sub(r'<.*?>|:', "", i).strip() for i in def_list]
118 |             def_list = [i for i in def_list if i]
119 | 
120 |             if def_list and part_of_speech:
121 |                 if part_of_speech in definition_dict:
122 |                     definition_dict[part_of_speech] += def_list
123 |                 else:
124 |                     definition_dict[part_of_speech] = def_list
125 | 
126 |         if definition_dict:
127 |             yield {word: definition_dict}
128 | 
129 | 
130 | # scrapy crawl collins -o collins.jl
131 | class CollinsCrawler(scrapy.Spider):
132 |     name = "collins"
133 |     allowed_domains = ["www.collinsdictionary.com"]
134 |     start_urls = ["https://www.collinsdictionary.com/dictionary/english/" + word for word in words]
135 | 
136 |     def parse(self, response):
137 |         word = response.request.url.split("/")[-1]
138 |         definition_dict = {}
139 | 
140 |         for sections in response.xpath("//div[@class='dictionary Cob_Adv_Brit']"
141 |                                        "//div[@class='content definitions cobuild br']/div[@class='hom']"):
142 |             try:
143 |                 part_of_speech = (sections.xpath(".//span[@class='pos']/text()").extract()[0]).strip()
144 |             except IndexError:  # no part-of-speech label found
145 |                 part_of_speech = False
146 |             def_list = sections.xpath("./div[@class='sense']/div[@class='def']").extract()
147 |             def_list = [re.sub(r'<.*?>', "", i[17:-6]).strip() for i in def_list]
148 |             def_list = [i for i in def_list if i]
149 | 
150 |             if def_list and part_of_speech:
151 |                 if part_of_speech in definition_dict:
152 |                     definition_dict[part_of_speech] += def_list
153 |                 else:
154 |                     definition_dict[part_of_speech] = def_list
155 | 
156 |         if definition_dict:
157 |             yield {word: definition_dict}
158 | 
--------------------------------------------------------------------------------
/dictionary_crawler/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 | 
6 | [settings]
7 | default = dictionary_crawler.settings
8 | 
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = dictionary_crawler
12 | 
--------------------------------------------------------------------------------
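Usage note (not part of the repository files above): besides the `scrapy crawl` command shown in the README, a spider can also be driven from a standalone script with Scrapy's `CrawlerProcess` API. This is a hedged sketch under that assumption; the script name, the `oxford.jl` output path, and the choice of spider are illustrative and taken from the README example.

```python
# run_oxford.py -- place next to scrapy.cfg and run with: python run_oxford.py
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

settings = get_project_settings()         # loads dictionary_crawler/settings.py via scrapy.cfg
settings.set("FEED_FORMAT", "jsonlines")  # same output format as `-o oxford.jl`
settings.set("FEED_URI", "oxford.jl")

process = CrawlerProcess(settings)
process.crawl("oxford")                   # spider name as defined in spiders/__init__.py
process.start()                           # blocks until the crawl finishes
```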