├── .gitignore ├── LICENSE.md ├── README.md └── dictionary_crawler ├── dictionary_crawler ├── __init__.py ├── items.py ├── middlewares.py ├── pipelines.py ├── settings.py └── spiders │ └── __init__.py └── scrapy.cfg /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.gitignore.io/api/linux,python,pycharm+iml,pycharm+all 2 | # Edit at https://www.gitignore.io/?templates=linux,python,pycharm+iml,pycharm+all 3 | 4 | ### Linux ### 5 | *~ 6 | 7 | # temporary files which can be created if a process still has a handle open of a deleted file 8 | .fuse_hidden* 9 | 10 | # KDE directory preferences 11 | .directory 12 | 13 | # Linux trash folder which might appear on any partition or disk 14 | .Trash-* 15 | 16 | # .nfs files are created when an open file is removed but is still being accessed 17 | .nfs* 18 | 19 | ### PyCharm+all ### 20 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 21 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 22 | 23 | # User-specific stuff 24 | .idea/**/workspace.xml 25 | .idea/**/tasks.xml 26 | .idea/**/usage.statistics.xml 27 | .idea/**/dictionaries 28 | .idea/**/shelf 29 | 30 | # Generated files 31 | .idea/**/contentModel.xml 32 | 33 | # Sensitive or high-churn files 34 | .idea/**/dataSources/ 35 | .idea/**/dataSources.ids 36 | .idea/**/dataSources.local.xml 37 | .idea/**/sqlDataSources.xml 38 | .idea/**/dynamic.xml 39 | .idea/**/uiDesigner.xml 40 | .idea/**/dbnavigator.xml 41 | 42 | # Gradle 43 | .idea/**/gradle.xml 44 | .idea/**/libraries 45 | 46 | # Gradle and Maven with auto-import 47 | # When using Gradle or Maven with auto-import, you should exclude module files, 48 | # since they will be recreated, and may cause churn. Uncomment if using 49 | # auto-import. 
50 | # .idea/modules.xml 51 | # .idea/*.iml 52 | # .idea/modules 53 | # *.iml 54 | # *.ipr 55 | 56 | # CMake 57 | cmake-build-*/ 58 | 59 | # Mongo Explorer plugin 60 | .idea/**/mongoSettings.xml 61 | 62 | # File-based project format 63 | *.iws 64 | 65 | # IntelliJ 66 | out/ 67 | 68 | # mpeltonen/sbt-idea plugin 69 | .idea_modules/ 70 | 71 | # JIRA plugin 72 | atlassian-ide-plugin.xml 73 | 74 | # Cursive Clojure plugin 75 | .idea/replstate.xml 76 | 77 | # Crashlytics plugin (for Android Studio and IntelliJ) 78 | com_crashlytics_export_strings.xml 79 | crashlytics.properties 80 | crashlytics-build.properties 81 | fabric.properties 82 | 83 | # Editor-based Rest Client 84 | .idea/httpRequests 85 | 86 | # Android studio 3.1+ serialized cache file 87 | .idea/caches/build_file_checksums.ser 88 | 89 | ### PyCharm+all Patch ### 90 | # Ignores the whole .idea folder and all .iml files 91 | # See https://github.com/joeblau/gitignore.io/issues/186 and https://github.com/joeblau/gitignore.io/issues/360 92 | 93 | .idea/ 94 | 95 | # Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-249601023 96 | 97 | *.iml 98 | modules.xml 99 | .idea/misc.xml 100 | *.ipr 101 | 102 | # Sonarlint plugin 103 | .idea/sonarlint 104 | 105 | ### PyCharm+iml ### 106 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 107 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 108 | 109 | # User-specific stuff 110 | 111 | # Generated files 112 | 113 | # Sensitive or high-churn files 114 | 115 | # Gradle 116 | 117 | # Gradle and Maven with auto-import 118 | # When using Gradle or Maven with auto-import, you should exclude module files, 119 | # since they will be recreated, and may cause churn. Uncomment if using 120 | # auto-import. 121 | # .idea/modules.xml 122 | # .idea/*.iml 123 | # .idea/modules 124 | # *.iml 125 | # *.ipr 126 | 127 | # CMake 128 | 129 | # Mongo Explorer plugin 130 | 131 | # File-based project format 132 | 133 | # IntelliJ 134 | 135 | # mpeltonen/sbt-idea plugin 136 | 137 | # JIRA plugin 138 | 139 | # Cursive Clojure plugin 140 | 141 | # Crashlytics plugin (for Android Studio and IntelliJ) 142 | 143 | # Editor-based Rest Client 144 | 145 | # Android studio 3.1+ serialized cache file 146 | 147 | ### PyCharm+iml Patch ### 148 | # Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-249601023 149 | 150 | 151 | ### Python ### 152 | # Byte-compiled / optimized / DLL files 153 | __pycache__/ 154 | *.py[cod] 155 | *$py.class 156 | 157 | # C extensions 158 | *.so 159 | 160 | # Distribution / packaging 161 | .Python 162 | build/ 163 | develop-eggs/ 164 | dist/ 165 | downloads/ 166 | eggs/ 167 | .eggs/ 168 | lib/ 169 | lib64/ 170 | parts/ 171 | sdist/ 172 | var/ 173 | wheels/ 174 | pip-wheel-metadata/ 175 | share/python-wheels/ 176 | *.egg-info/ 177 | .installed.cfg 178 | *.egg 179 | MANIFEST 180 | 181 | # PyInstaller 182 | # Usually these files are written by a python script from a template 183 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
184 | *.manifest 185 | *.spec 186 | 187 | # Installer logs 188 | pip-log.txt 189 | pip-delete-this-directory.txt 190 | 191 | # Unit test / coverage reports 192 | htmlcov/ 193 | .tox/ 194 | .nox/ 195 | .coverage 196 | .coverage.* 197 | .cache 198 | nosetests.xml 199 | coverage.xml 200 | *.cover 201 | .hypothesis/ 202 | .pytest_cache/ 203 | 204 | # Translations 205 | *.mo 206 | *.pot 207 | 208 | # Django stuff: 209 | *.log 210 | local_settings.py 211 | db.sqlite3 212 | db.sqlite3-journal 213 | 214 | # Flask stuff: 215 | instance/ 216 | .webassets-cache 217 | 218 | # Scrapy stuff: 219 | .scrapy 220 | 221 | # Sphinx documentation 222 | docs/_build/ 223 | 224 | # PyBuilder 225 | target/ 226 | 227 | # Jupyter Notebook 228 | .ipynb_checkpoints 229 | 230 | # IPython 231 | profile_default/ 232 | ipython_config.py 233 | 234 | # pyenv 235 | .python-version 236 | 237 | # pipenv 238 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 239 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 240 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 241 | # install all needed dependencies. 242 | #Pipfile.lock 243 | 244 | # celery beat schedule file 245 | celerybeat-schedule 246 | 247 | # SageMath parsed files 248 | *.sage.py 249 | 250 | # Environments 251 | .env 252 | .venv 253 | env/ 254 | venv/ 255 | ENV/ 256 | env.bak/ 257 | venv.bak/ 258 | 259 | # Spyder project settings 260 | .spyderproject 261 | .spyproject 262 | 263 | # Rope project settings 264 | .ropeproject 265 | 266 | # mkdocs documentation 267 | /site 268 | 269 | # mypy 270 | .mypy_cache/ 271 | .dmypy.json 272 | dmypy.json 273 | 274 | # Pyre type checker 275 | .pyre/ 276 | 277 | # End of https://www.gitignore.io/api/linux,python,pycharm+iml,pycharm+all -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | MIT License 4 | 5 | Copyright (c) 2019 Peyman Mohseni Kiasari 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 8 | 9 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 10 | 11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ![](https://st.hzcdn.com/simgs/93e15c6309f22aa9_4-3114/home-design.jpg)
2 | 
3 | Art by: [Altered Artichoke](https://alteredartichoke.com)
4 | 
5 | 
6 | # Dictionary Crawler
7 | This is Python code, based on the Scrapy package, for crawling well-known online dictionaries such as Oxford, Longman, Cambridge, Webster, and Collins.
8 | Be aware that this code was published on July 21, 2019; if those sites are updated in the future, it may stop working properly.
9 | ## Prerequisites
10 | The Scrapy Python package
11 | 
12 | ## How to use it?
13 | ##### 1- Set the words:
14 | In *dictionary_crawler/dictionary_crawler/spiders* there is an **\__init__.py** file containing a list named **words**, initialized like this:
15 | ```python
16 | words = ['I', 'hope', 'you', 'like', 'this', 'dictionary', 'web', 'crawler']
17 | ```
18 | Change this list to the words you want to crawl.
19 | ##### 2- Run:
20 | Go to the *dictionary_crawler/dictionary_crawler/spiders* directory and run this in the terminal:
21 | 
22 |     scrapy crawl [name of dictionary] -o [name of file to write into].jl
23 | 
24 | For example, for the Oxford dictionary:
25 | 
26 |     scrapy crawl oxford -o oxford.jl
27 | The data will be stored in oxford.jl in *dictionary_crawler/dictionary_crawler/spiders*.
28 | 
29 | ## What does the output look like?
30 | The output is a JSON Lines file in which each line maps a word to its definitions, grouped by part of speech (a snippet for loading the file back into Python is included at the end of this README).
31 | For example, the word **hope** from the Webster dictionary looks like this:
32 | ```json
33 | {"hope":
34 |     {
35 |     "verb": ["to cherish a desire with anticipation to want something to happen or be true"],
36 |     "noun": ["to desire with expectation of obtainment or fulfillment", "to expect with confidence", "city in southwestern Arkansas that was the childhood home of President Bill Clinton population 10,095"],
37 |     "biographical name": ["desire accompanied by expectation of or belief in fulfillment", "expectation of fulfillment or success", "someone or something on which hopes are centered", "something desired or hoped (see 1) for", "Anthony", "Bob 1903–2003 originally Leslie Townes Hope American (British-born) comedian"],
38 |     "geographical name": ["Victor Alexander John 1887–1951 2nd Marquis of", "British soldier; viceroy of India (1936–43)"]
39 |     }
40 | }
41 | ```
42 | ## Author
43 | 
44 | * **Peyman Mohseni kiasari**
45 | 
46 | ## License
47 | 
48 | This project is licensed under the MIT License - see the [LICENSE.md](LICENSE.md) file for details.
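## Loading the output

A minimal sketch of reading one of the crawled `.jl` files back into a single Python dictionary. It assumes the `oxford.jl` file produced by the example command above; the merge logic simply keeps the first entry seen for each word.

```python
import json

definitions = {}  # word -> {part_of_speech: [definitions]}
with open("oxford.jl", encoding="utf-8") as jl_file:
    for line in jl_file:
        line = line.strip()
        if not line:
            continue
        record = json.loads(line)      # each line is one {"word": {...}} object
        for word, entry in record.items():
            definitions.setdefault(word, entry)

print(definitions.get("hope", {}))
```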
49 | -------------------------------------------------------------------------------- /dictionary_crawler/dictionary_crawler/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kiasar/Dictionary_crawler/25d2b6203b64ee4d8a4b8fad83ebfe5d3fb615a0/dictionary_crawler/dictionary_crawler/__init__.py -------------------------------------------------------------------------------- /dictionary_crawler/dictionary_crawler/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class DictionaryCrawlerItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /dictionary_crawler/dictionary_crawler/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class DictionaryCrawlerSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class DictionaryCrawlerDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 
67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /dictionary_crawler/dictionary_crawler/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class DictionaryCrawlerPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /dictionary_crawler/dictionary_crawler/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for dictionary_crawler project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'dictionary_crawler' 13 | 14 | SPIDER_MODULES = ['dictionary_crawler.spiders'] 15 | NEWSPIDER_MODULE = 'dictionary_crawler.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'dictionary_crawler (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | FEED_EXPORT_ENCODING = "utf-8" 24 | 25 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 26 | CONCURRENT_REQUESTS = 512 27 | 28 | # Configure a delay for requests for the same website (default: 0) 29 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 30 | # See also autothrottle settings and docs 31 | DOWNLOAD_DELAY = 0.0 32 | # The download delay setting will honor only one of: 33 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 34 | #CONCURRENT_REQUESTS_PER_IP = 16 35 | 36 | # Disable cookies (enabled by default) 37 | #COOKIES_ENABLED = False 38 | 39 | # Disable Telnet Console (enabled by default) 40 | #TELNETCONSOLE_ENABLED = False 41 | 42 | # Override the default request headers: 43 | #DEFAULT_REQUEST_HEADERS = { 44 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 45 | # 'Accept-Language': 'en', 46 | #} 47 | 48 | # Enable or disable spider middlewares 49 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 50 | #SPIDER_MIDDLEWARES = { 51 | # 'dictionary_crawler.middlewares.DictionaryCrawlerSpiderMiddleware': 543, 52 | #} 53 | 54 | # Enable or disable downloader middlewares 55 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 56 | #DOWNLOADER_MIDDLEWARES = { 57 | # 'dictionary_crawler.middlewares.DictionaryCrawlerDownloaderMiddleware': 543, 58 | #} 59 | 60 | # Enable or disable extensions 61 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 62 | #EXTENSIONS = { 63 | # 'scrapy.extensions.telnet.TelnetConsole': None, 64 | #} 65 | 66 | # Configure item pipelines 67 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 68 | #ITEM_PIPELINES = { 69 | # 'dictionary_crawler.pipelines.DictionaryCrawlerPipeline': 300, 70 | #} 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 74 | #AUTOTHROTTLE_ENABLED = True 75 | # The initial download delay 76 | #AUTOTHROTTLE_START_DELAY = 5 77 | # The maximum download delay to be set in case of high latencies 78 | #AUTOTHROTTLE_MAX_DELAY = 60 79 | # The average number of requests Scrapy should be sending in parallel to 80 | # each remote server 81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 82 | # Enable showing throttling stats for every response received: 83 | #AUTOTHROTTLE_DEBUG = False 84 | 85 | # Enable and configure HTTP caching (disabled by default) 86 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 87 | #HTTPCACHE_ENABLED = True 88 | #HTTPCACHE_EXPIRATION_SECS = 0 89 | #HTTPCACHE_DIR = 'httpcache' 90 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 92 | -------------------------------------------------------------------------------- 
/dictionary_crawler/dictionary_crawler/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | import re
2 | 
3 | import scrapy
4 | 
5 | # by Peyman (mohsenikiasari@ce.sharif.edu) in 2019.
6 | 
7 | words = ['I', 'hope', 'you', 'like', 'this', 'dictionary', 'web', 'crawler']
8 | 
9 | # scrapy crawl oxford -o oxford.jl
10 | class OxfordCrawler(scrapy.Spider):
11 |     name = "oxford"
12 |     allowed_domains = ["www.lexico.com"]
13 |     start_urls = ["https://www.lexico.com/en/definition/" + word for word in words]
14 | 
15 |     def parse(self, response):
16 |         word = response.request.url.split("/")[-1]
17 |         definition_dict = {}
18 | 
19 |         for sections in response.xpath("//section[@class='gramb']"):
20 |             try:
21 |                 part_of_speech = sections.xpath(".//span[@class='pos']/text()").extract()[0]
22 |             except IndexError:  # no part-of-speech label found
23 |                 part_of_speech = False
24 |             def_list = sections.xpath("./ul/li/div[@class='trg']//span[@class='ind']").extract()
25 |             if not def_list:
26 |                 def_list = sections.xpath(".//div[@class='empty_sense']//div[@class='crossReference']").extract()
27 | 
28 |             def_list = [re.sub(r'<.*?>', "", i).strip() for i in def_list]
29 |             def_list = [i for i in def_list if i]
30 | 
31 |             if def_list and part_of_speech:
32 |                 if part_of_speech in definition_dict:
33 |                     definition_dict[part_of_speech] += def_list
34 |                 else:
35 |                     definition_dict[part_of_speech] = def_list
36 | 
37 |         if definition_dict:
38 |             yield {word: definition_dict}
39 | 
40 | 
41 | # scrapy crawl longman -o longman.jl
42 | class LongmanCrawler(scrapy.Spider):
43 |     name = "longman"
44 |     allowed_domains = ["www.ldoceonline.com"]
45 |     start_urls = ["https://www.ldoceonline.com/dictionary/" + word for word in words]
46 | 
47 |     def parse(self, response):
48 |         word = response.request.url.split("/")[-1]
49 |         definition_dict = {}
50 | 
51 |         for sections in response.xpath("//span[@class='dictentry']"):
52 |             try:
53 |                 part_of_speech = (sections.xpath(".//span[@class='POS']/text()").extract()[0]).strip()
54 |             except IndexError:  # no part-of-speech label found
55 |                 part_of_speech = False
56 |             def_list = sections.xpath(".//span[@class='Sense']/span[@class='DEF']").extract()
57 |             def_list = [re.sub(r'<.*?>', "", i[18:-7]).strip() for i in def_list]
58 |             def_list = [i for i in def_list if i]
59 | 
60 |             if def_list and part_of_speech:
61 |                 if part_of_speech in definition_dict:
62 |                     definition_dict[part_of_speech] += def_list
63 |                 else:
64 |                     definition_dict[part_of_speech] = def_list
65 | 
66 |         if definition_dict:
67 |             yield {word: definition_dict}
68 | 
69 | 
70 | # scrapy crawl cambridge -o cambridge.jl
71 | class CambridgeCrawler(scrapy.Spider):
72 |     name = "cambridge"
73 |     allowed_domains = ["dictionary.cambridge.org"]
74 |     start_urls = ["https://dictionary.cambridge.org/dictionary/english/" + word for word in words]
75 | 
76 |     def parse(self, response):
77 |         word = response.request.url.split("/")[-1]
78 |         definition_dict = {}
79 | 
80 |         for entry in response.xpath("//div[@class='entry-body__el clrd js-share-holder']"):
81 |             part_of_speeches = entry.xpath("./div[@class='pos-header']//span[@class='pos']/text()").extract()
82 |             def_list = entry.xpath(
83 |                 ".//div[@class='sense-body']/div[@class='def-block pad-indent']//b[@class='def']").extract()
84 |             def_list = [re.sub(r'<.*?>|:', "", i[15:-4]).strip() for i in def_list]
85 |             def_list = [i for i in def_list if i]
86 | 
87 |             if def_list and part_of_speeches:
88 |                 # attach this block's definitions to every listed part of speech
89 |                 for part_of_speech in part_of_speeches:
90 |                     if part_of_speech in definition_dict:
91 |                         definition_dict[part_of_speech] += def_list
92 |                     else:
93 |                         definition_dict[part_of_speech] = def_list
94 | 
95 |         if definition_dict:
96 |             yield {word: definition_dict}
97 | 
98 | 
99 | # scrapy crawl webster -o webster.jl
100 | class WebsterCrawler(scrapy.Spider):
101 |     name = "webster"
102 |     allowed_domains = ["www.merriam-webster.com"]
103 |     start_urls = ["https://www.merriam-webster.com/dictionary/" + word for word in words]
104 | 
105 |     def parse(self, response):
106 |         word = response.request.url.split("/")[-1]
107 |         definition_dict = {}
108 | 
109 |         part_of_speeches = [re.sub(r'\(.*\)', "", i).strip() for i in
110 |                             response.xpath("//span[@class='fl']/a/text()|//span[@class='fl']/text()").extract()]
111 | 
112 |         for sections in response.xpath("//div[contains(@id, 'dictionary-entry')]/div[@class='vg']"):
113 |             part_of_speech = part_of_speeches.pop(0)
114 |             def_list = sections.xpath(
115 |                 ".//span[@class='dtText' or @class='unText'][not(ancestor::span[@class='dtText'])]").extract()
116 |             def_list = [re.sub(r'.+', "", i[21:-7]) for i in def_list]
117 |             def_list = [re.sub(r'<.*?>|:', "", i).strip() for i in def_list]
118 |             def_list = [i for i in def_list if i]
119 | 
120 |             if def_list and part_of_speech:
121 |                 if part_of_speech in definition_dict:
122 |                     definition_dict[part_of_speech] += def_list
123 |                 else:
124 |                     definition_dict[part_of_speech] = def_list
125 | 
126 |         if definition_dict:
127 |             yield {word: definition_dict}
128 | 
129 | 
130 | # scrapy crawl collins -o collins.jl
131 | class CollinsCrawler(scrapy.Spider):
132 |     name = "collins"
133 |     allowed_domains = ["www.collinsdictionary.com"]
134 |     start_urls = ["https://www.collinsdictionary.com/dictionary/english/" + word for word in words]
135 | 
136 |     def parse(self, response):
137 |         word = response.request.url.split("/")[-1]
138 |         definition_dict = {}
139 | 
140 |         for sections in response.xpath("//div[@class='dictionary Cob_Adv_Brit']"
141 |                                        "//div[@class='content definitions cobuild br']/div[@class='hom']"):
142 |             try:
143 |                 part_of_speech = (sections.xpath(".//span[@class='pos']/text()").extract()[0]).strip()
144 |             except IndexError:  # no part-of-speech label found
145 |                 part_of_speech = False
146 |             def_list = sections.xpath("./div[@class='sense']/div[@class='def']").extract()
147 |             def_list = [re.sub(r'<.*?>', "", i[17:-6]).strip() for i in def_list]
148 |             def_list = [i for i in def_list if i]
149 | 
150 |             if def_list and part_of_speech:
151 |                 if part_of_speech in definition_dict:
152 |                     definition_dict[part_of_speech] += def_list
153 |                 else:
154 |                     definition_dict[part_of_speech] = def_list
155 | 
156 |         if definition_dict:
157 |             yield {word: definition_dict}
158 | 
--------------------------------------------------------------------------------
/dictionary_crawler/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 | 
6 | [settings]
7 | default = dictionary_crawler.settings
8 | 
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = dictionary_crawler
12 | 
--------------------------------------------------------------------------------
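Usage note (not part of the repository files above): besides the `scrapy crawl` command shown in the README, a spider can also be driven from a standalone script with Scrapy's `CrawlerProcess` API. This is a hedged sketch under that assumption; the script name, the `oxford.jl` output path, and the choice of spider are illustrative and taken from the README example.

```python
# run_oxford.py -- place next to scrapy.cfg and run with: python run_oxford.py
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

settings = get_project_settings()         # loads dictionary_crawler/settings.py via scrapy.cfg
settings.set("FEED_FORMAT", "jsonlines")  # same output format as `-o oxford.jl`
settings.set("FEED_URI", "oxford.jl")

process = CrawlerProcess(settings)
process.crawl("oxford")                   # spider name as defined in spiders/__init__.py
process.start()                           # blocks until the crawl finishes
```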