", re.split('', r.text)[1])[0]
154 | # "Откусываем" оставшиеся теги.
155 | beaux_text=BeautifulSoup(text, "lxml")
156 | n_text = beaux_text.get_text()
157 | n_text = re.sub('\xa0', '', n_text)
158 | n_text = unify.unify_sym(n_text)
159 | return(n_text)
160 |
161 |
162 | # In[77]:
163 |
164 | #Теперь список авторов нам нужно превратить в список данных автора и ссылки на его тексты:
def getAuthorInfo(authorlink):
    """Fetch an author's page and pull out its two counters.

    Args:
        authorlink: URL of the author's page; requested with the
            module-level ``headers``.

    Returns:
        A tuple ``(author_items, author_readers)`` of strings cut from the
        page text after the markers "Произведений: " and "Читателей: ",
        or ``('', '')`` when the counters cannot be extracted.
    """
    r = requests.get(authorlink, headers=headers)

    try:
        # NOTE(review): the empty split pattern looks like a closing
        # delimiter that was lost from this copy of the source -- confirm
        # against the real page markup before relying on the values.
        author_items = re.split("", re.split("Произведений: ", r.text)[1])[0]
        author_readers = re.split("", re.split("Читателей: ", r.text)[1])[0]
    except (IndexError, ValueError):
        # IndexError: a marker is absent from the page, so the outer split
        # has no second element.  ValueError: some Python versions reject
        # a split pattern that can only match the empty string.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return '', ''
    return author_items, author_readers
176 |
177 |
178 | # In[12]:
179 |
WDIR = ensure_dir(r'/home/tsha/stihi_ru/texts')


# In[ ]:

# Walk the daily listings of every rubric from 2015 back to 2005, newest
# date first.  Each text is saved to WDIR/<year>/<month>/<textid>.txt and
# described by one tab-separated row in that year's metatable_texts.txt.
# Row schema: textid, poemlink, title, author, authorlink, date, time, path,
#             author_readers, author_poems, topic, genre
for year in range(2015, 2004, -1):
    meta_path = ensure_dir(r'/home/tsha/stihi_ru/meta/'+str(year))+'/metatable_texts.txt'
    # The table is opened in append mode so a re-run keeps earlier rows;
    # write the header only when the table is new or empty so that it is
    # not repeated in the middle of the data.
    needs_header = not os.path.exists(meta_path) or os.path.getsize(meta_path) == 0
    with open(meta_path, 'a', encoding='utf8') as metatable_texts:
        if needs_header:
            metatable_texts.write('textid\tURL\ttitle\tauthor\tauthorlink\tdate\ttime\tpath\tauthor_readers\tauthor_poems\ttopic\tgenre\n')
        for month_num in range(12, 0, -1):
            # Zero-padded strings are what ends up in ids, paths and links.
            month = '%02d' % month_num
            path = ensure_dir(WDIR + "/"+str(year)+"/"+month)
            for day_num in range(31, 0, -1):
                day = '%02d' % day_num
                # Nothing is fetched for 2015-12 or for 2015-11-11 and later.
                if year == 2015 and (month_num == 12 or (month_num == 11 and day_num >= 11)):
                    continue
                for topic in rubrics:
                    print(year, month, day, rubrics[topic])
                    link = make_daily_link(year, month, day, topic)
                    text_info = get_poem_links_by_date(link)

                    for i, info in enumerate(tqdm(text_info)):
                        textid = str(year)+month+day+str(i)+str(topic)
                        textlink = info[0]
                        textpath = os.path.join(path, textid+'.txt')
                        try:
                            text = getTextStihi(textlink)
                            with open(textpath, 'w', encoding='utf8') as textfile:
                                textfile.write(text)
                            author_poems, author_readers = getAuthorInfo(info[3])
                            genre = genre_dic[rubrics[topic]]
                            # Column order follows the header: readers first,
                            # then poems.  NOTE(review): rows appended by
                            # earlier runs of this script have these two
                            # columns the other way round.
                            textfeats = [textid] + info + [textpath, author_readers, author_poems, topic, genre]
                            metatable_texts.write("\t".join(textfeats)+'\n')
                        except Exception as err:
                            # Best effort: one broken page must not stop the
                            # crawl, but say what was skipped.  Ctrl-C is
                            # not caught here and still aborts the run.
                            print('skipped', textlink, repr(err))
                            continue
                        # NOTE(review): the original indentation was lost;
                        # this print is assumed to sit in the per-text loop,
                        # i.e. it runs after each successfully saved text.
                        print(textlink)
225 |
226 |
227 | # In[ ]:
228 |
229 |
230 |
231 |
--------------------------------------------------------------------------------
/youtube_crawl_demo.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 2,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stderr",
10 | "output_type": "stream",
11 | "text": [
12 | "/home/mi_air/.local/lib/python3.5/site-packages/requests/__init__.py:91: RequestsDependencyWarning: urllib3 (1.24.1) or chardet (2.3.0) doesn't match a supported version!\n",
13 | " RequestsDependencyWarning)\n"
14 | ]
15 | }
16 | ],
17 | "source": [
18 | "from __future__ import print_function\n",
19 | "\n",
20 | "import os\n",
21 | "import sys\n",
22 | "import time\n",
23 | "import json\n",
24 | "import requests\n",
25 | "import lxml.html\n",
26 | "import io\n",
27 | "from tqdm import tqdm\n",
28 | "from lxml.cssselect import CSSSelector\n",
29 | "\n",
30 | "#used to make the Browser Working\n",
31 | "from selenium import webdriver\n",
32 | "#Send keycodes to Elements\n",
33 | "from selenium.webdriver.common.keys import Keys\n",
34 | "#scrape the url's and comments\n",
35 | "from bs4 import BeautifulSoup\n",
36 | "\n",
37 | "import re\n",
38 | "import datetime\n",
39 | "import time\n",
40 | "import codecs\n",
41 | "\n",
42 | "\n"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": 5,
48 | "metadata": {
49 | "collapsed": true
50 | },
51 | "outputs": [],
52 | "source": [
53 | "\n",
54 | "\n",
55 | "YOUTUBE_COMMENTS_URL = 'https://www.youtube.com/all_comments?v={youtube_id}'\n",
56 | "YOUTUBE_COMMENTS_AJAX_URL = 'https://www.youtube.com/comment_ajax'\n",
57 | "\n",
58 | "USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'\n"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 3,
64 | "metadata": {},
65 | "outputs": [
66 | {
67 | "data": {
68 | "text/plain": [
69 | "'2019-04-10'"
70 | ]
71 | },
72 | "execution_count": 3,
73 | "metadata": {},
74 | "output_type": "execute_result"
75 | }
76 | ],
77 | "source": [
78 | "str(datetime.date.today())"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": 4,
84 | "metadata": {},
85 | "outputs": [
86 | {
87 | "name": "stdout",
88 | "output_type": "stream",
89 | "text": [
90 | "{'/watch?v=ew-8i7UUpLY', '/watch?v=oOxDLuTXyCo', '/watch?v=32Rj7dv2IRE', '/watch?v=RtSS0SJE8oE', '/watch?v=N0CGM956z18', '/watch?v=fhCLQNYowcE', '/watch?v=AbXahmBSLTk', '/watch?v=ocekebVtZvw', '/watch?v=JM-GwDh73Wc', '/watch?v=BLHrjzTEr0c', '/watch?v=H6Kl8kheGBg', '/watch?v=cnn-z4U_S50', '/watch?v=F1B9Fk_SgI0', '/watch?v=qr4AaWAkf34', '/watch?v=knBhDpMXsQo', '/watch?v=DEfgiRorfbM', '/watch?v=DVH0CzurtIE', '/watch?v=p1JPKLa-Ofc', '/watch?v=nhcDl3S5sXQ', '/watch?v=TcMBFSGVi1c', '/watch?v=b52lho8lD6Q', '/watch?v=kPg3M4C9N9w', '/watch?v=XlmaJ-yU46U', '/watch?v=U_90XNCBatY', '/watch?v=6pdfriFuFt8', '/watch?v=vw2SaHkGfss', '/watch?v=yzZIS1TtXjw', '/watch?v=q97nD5dOS5M', '/watch?v=31OnT5iSLA0', '/watch?v=3y-O-4IL-PU', '/watch?v=RzfO1FbUCo8', '/watch?v=IRKwwk7CXBQ', '/watch?v=ZD9OkKE0TfA', '/watch?v=35adpxPiNlU', '/watch?v=4yXU8K-9SIw', '/watch?v=nDq6TstdEi8', '/watch?v=3XNDaISqFX8', '/watch?v=3p1fHBNILhM', '/watch?v=W1j28DRcFBQ', '/watch?v=qywZ6lUcNo8', '/watch?v=_XFzT9GMmw8', '/watch?v=hE2Ira-Cwxo', '/watch?v=A8N4_cjLXH8', '/watch?v=kvvLXVDYl6I', '/watch?v=vEUlnLOQG8k', '/watch?v=ssbNmaOmVMk', '/watch?v=XW_KhFq4LQo', '/watch?v=CaBq3SvO0a4', '/watch?v=6Z6zfRWTotY', '/watch?v=3t195yz9xCc', '/watch?v=YbiKtZSqmB4', '/watch?v=wzjWIxXBs_s', '/watch?v=4D8ezH0iXh8', '/watch?v=cyzqxRHLPpk', '/watch?v=CiL-yTNa6QY', '/watch?v=eKmRkS1os7k', '/watch?v=ufI6DCB6X2U', '/watch?v=c18WvLeJn-I', '/watch?v=f30Jq8BQPQo', '/watch?v=7gvqArR7nlA', '/watch?v=el00pNoRB34', '/watch?v=Ba44js56nF4', '/watch?v=gporsZ8WnsM', '/watch?v=P2qOZDuiYlM', '/watch?v=x865r5EqKDo', '/watch?v=PHgc8Q6qTjc', '/watch?v=TwFvvcHf7Dw', '/watch?v=_nf8GV0AvtI', '/watch?v=AKzFFJXMDyE', '/watch?v=iP0MrLN4xso', '/watch?v=o39KwSswsgw', '/watch?v=iloh1SUe42g', '/watch?v=2apVwq-pX9E', '/watch?v=u4x9YyRnFDE', '/watch?v=hsGOT_0L16U', '/watch?v=kO9bzwqCNgo', '/watch?v=IRUihzQvBMo', '/watch?v=zUyH3XhpLTo', '/watch?v=DFia7FhVmuM', '/watch?v=buCD-_1UPn4', '/watch?v=mFlrc16xjik', '/watch?v=qcGNoZ3r9t8', '/watch?v=XmAsgB4EMR8', 
'/watch?v=KCSNFZKbhZE', '/watch?v=nMfPqeZjc2c', '/watch?v=eEd2K1FxNQY', '/watch?v=3NycM9lYdRI', '/watch?v=gmU9PBDS-0k', '/watch?v=3fEdoqHCaM8', '/watch?v=jCC8fPQOaxU', '/watch?v=VUArb3AIpm4', '/watch?v=z6buCeA4ZSc', '/watch?v=K3Qzzggn--s', '/watch?v=jwxI0OX3GsA', '/watch?v=xcg_e-FY_Vs', '/watch?v=nvRjW2oYBiU', '/watch?v=b2AcxL88DoI', '/watch?v=ZeTWW47yhC4', '/watch?v=sxt4YCIsn2I', '/watch?v=S1gp0m4B5p8', '/watch?v=laoUmXqscdk', '/watch?v=hC8CH0Z3L54', '/watch?v=eQHo2zo58no', '/watch?v=lZIq7A9zKFs', '/watch?v=C5Gm8UvxKlU', '/watch?v=2KBFD0aoZy8', '/watch?v=JKeG1iJNxGs', '/watch?v=OdV6SkGZb3g', '/watch?v=g9bzrGBzSC4', '/watch?v=b5W9t62t10I', '/watch?v=66Ki5_-E0n4', '/watch?v=yMRoNNKWuqQ', '/watch?v=4cx9apL7HhY', '/watch?v=9DzSGPad_z4', '/watch?v=4H9jTQKmR3Q', '/watch?v=4zI6guqVqiI', '/watch?v=sCD9zjf_YRU', '/watch?v=jfjfzKf85Ac', '/watch?v=Yxnsxg4rs0E', '/watch?v=lFcSrYw-ARY', '/watch?v=1ZYbU82GVz4', '/watch?v=1nnRC6jDOCI', '/watch?v=7Jj83FOlBF8', '/watch?v=L5cLq1mIC70', '/watch?v=4NcoqtHH2IE', '/watch?v=jJys1BM8x8k', '/watch?v=tF0uHeLy1v0', '/watch?v=gXKPjSkCSMM', '/watch?v=9uIk_91GQYI', '/watch?v=njHvGxZgTPk', '/watch?v=l8kLiUZDbQ4', '/watch?v=t433PEQGErc', '/watch?v=GRTS9yZJREk', '/watch?v=emKhAptPqg4', '/watch?v=Z-0FXUgVsVs', '/watch?v=CX17qmYO0o0', '/watch?v=WzfRhSU9_qA', '/watch?v=kHkKihbfsXQ', '/watch?v=GMFewiplIbw', '/watch?v=xpVfcZ0ZcFM', '/watch?v=x4o5g_PGkiA', '/watch?v=qvzW_CJTlmM', '/watch?v=VfNvJs7-RM4', '/watch?v=LS_-ZMcGnow', '/watch?v=r34Isj_erU4', '/watch?v=zXtsGAkyeIo', '/watch?v=tKMmMHyLBCE', '/watch?v=0fUMyQlzujU', '/watch?v=4IrkawvzGE8', '/watch?v=k4YRWT_Aldo', '/watch?v=-UOMvxh4MYU', '/watch?v=NwSIgDKvMHk', '/watch?v=au2n7VVGv_c', '/watch?v=HmH4W8JOifg', '/watch?v=IV6IuCTg6MU', '/watch?v=8xQrvclhJrU', '/watch?v=RbMqcFvtMN8', '/watch?v=5nxD4PY39xw', '/watch?v=jC7eeYwKrg0', '/watch?v=txQ6t4yPIM0', '/watch?v=WPni755-Krg', '/watch?v=NymS69shfkc', '/watch?v=_YuMfMLC8FA', '/watch?v=jHcbLgNQ4Co', '/watch?v=wi0q0y7U75c', '/watch?v=JeTzND6XrB0', 
'/watch?v=r8EF3X8EI2o', '/watch?v=s2Gw6r6HooA', '/watch?v=9QbltzIUV6w', '/watch?v=B8yo1HPW2O4', '/watch?v=JnfP9qKAbk8', '/watch?v=o7W7OvETO40', '/watch?v=vjjS92Q0lYs', '/watch?v=mzs7lmETE90', '/watch?v=qO2Y6BHYhHw', '/watch?v=HO2AJneTjAM', '/watch?v=GFEcOvs6YWk', '/watch?v=rTKodwXQi78', '/watch?v=57p6K-5ZSNc', '/watch?v=-twm7ldMOtI', '/watch?v=nvm6RzVLjWo', '/watch?v=J3UXp9jIr-U', '/watch?v=Cfd6PknS0Fw', '/watch?v=waU75jdUnYw', '/watch?v=ERUugjLmwuY', '/watch?v=MikD7plCDQg', '/watch?v=zxeTC0wKPXs', '/watch?v=OjWsugnahJ0', '/watch?v=zS-Og_RfdNc', '/watch?v=eDuRoPIOBjE', '/watch?v=23e9_o5rxsA', '/watch?v=STZZso9GUhA', '/watch?v=nEDKVNoE2ws', '/watch?v=m8UQ4O7UiDs', '/watch?v=EIeUJcP3T0Q', '/watch?v=7ysFgElQtjI', '/watch?v=jsHX1cFL41w', '/watch?v=qOXXvttM-e8', '/watch?v=9z1nTwP2n0w', '/watch?v=LH4Y1ZUUx2g', '/watch?v=0HpYEZ86Wuc', '/watch?v=2sap-GTtCiU', '/watch?v=8Ap7aJsfaXQ', '/watch?v=IPPYI64aHno', '/watch?v=ygyz3Mqjh0k', '/watch?v=-QKP4iVCaiY', '/watch?v=-bmA1D00B4o', '/watch?v=01ouUdAEFdU', '/watch?v=wWdXfX4Vpm8'}\n"
91 | ]
92 | }
93 | ],
94 | "source": [
95 | "\n",
96 | "# The set where the links to the videos are stored\n",
97 | "links = set()\n",
98 | "\n",
99 | "comments = list()\n",
100 | "\n",
101 | "homePage = 'https:www.youtube.com'\n",
102 | "linksSize = 10\n",
103 | "driver = webdriver.Firefox()\n",
104 | "\n",
105 | "output = open(\"/media/mi_air/0F0B7DDE62EEA81E/youtube/\"+str(datetime.date.today())+\".txt\",\"w\")\n",
106 | "\n",
107 | "def loadFullPage(Timeout):\n",
108 | " reachedbottom = None\n",
109 | " while not reachedbottom:\n",
110 | " #scroll one pane down\n",
111 | " driver.execute_script(\"window.scrollTo(0,Math.max(document.documentElement.scrollHeight,document.body.scrollHeight,document.documentElement.clientHeight));\");\n",
112 | " time.sleep(Timeout)\n",
113 | " #check if the bottom is reached\n",
114 | " a = driver.execute_script(\"return document.documentElement.scrollTop;\")\n",
115 | " b = driver.execute_script(\"return document.documentElement.scrollHeight - document.documentElement.clientHeight;\")\n",
116 | " relativeHeight = a / b\n",
117 | " if(relativeHeight==1):\n",
118 | " reachedbottom = True\n",
119 | "def getComments(link):\n",
120 | " driver.get(url='https:youtube.com'+link)\n",
121 | " loadFullPage(1)\n",
122 | "\n",
123 | "\n",
124 | "def main():\n",
125 | " driver.get(url=homePage)\n",
126 | " enoughLinks = None\n",
127 | "\n",
128 | " while not enoughLinks:\n",
129 | " loadFullPage(1)\n",
130 | "\n",
131 | " soup = BeautifulSoup(driver.page_source, 'html.parser')\n",
132 | "\n",
133 | " for link in soup.find_all(\"a\",class_=\"yt-simple-endpoint style-scope ytd-grid-video-renderer\", href=True):\n",
134 | " if link not in links:\n",
135 | " links.add(link['href'])\n",
136 | "\n",
137 | " if len(links) < linksSize:\n",
138 | " driver.refresh()\n",
139 | " else:\n",
140 | " #for i in range(len(links)-1000):\n",
141 | " #links.pop()\n",
142 | " enoughLinks = True\n",
143 | "\n",
144 | " #links.sort()\n",
145 | " for link in links:\n",
146 | " output.write(link)\n",
147 | " output.write(\"\\n\")\n",
148 | " output.close()\n",
149 | " print(links)\n",
150 | "\n",
151 | "\n",
152 | "if __name__ == '__main__':\n",
153 | " main()"
154 | ]
155 | },
156 | {
157 | "cell_type": "code",
158 | "execution_count": null,
159 | "metadata": {
160 | "collapsed": true
161 | },
162 | "outputs": [],
163 | "source": [
164 | "def find_value(html, key, num_chars=2):\n",
165 | " pos_begin = html.find(key) + len(key) + num_chars\n",
166 | " pos_end = html.find('\"', pos_begin)\n",
167 | " return html[pos_begin: pos_end]\n",
168 | "\n",
169 | "\n",
170 | "def extract_comments(html):\n",
171 | " tree = lxml.html.fromstring(html)\n",
172 | " item_sel = CSSSelector('.comment-item')\n",
173 | " text_sel = CSSSelector('.comment-text-content')\n",
174 | " time_sel = CSSSelector('.time')\n",
175 | " author_sel = CSSSelector('.user-name')\n",
176 | "\n",
177 | " for item in item_sel(tree):\n",
178 | " yield {'cid': item.get('data-cid'),\n",
179 | " 'text': text_sel(item)[0].text_content(),\n",
180 | " 'time': time_sel(item)[0].text_content().strip(),\n",
181 | " 'author': author_sel(item)[0].text_content()}\n",
182 | "\n",
183 | "\n",
184 | "def extract_reply_cids(html):\n",
185 | " tree = lxml.html.fromstring(html)\n",
186 | " sel = CSSSelector('.comment-replies-header > .load-comments')\n",
187 | " return [i.get('data-cid') for i in sel(tree)]\n",
188 | "\n",
189 | "\n",
190 | "def ajax_request(session, url, params, data, retries=10, sleep=20):\n",
191 | " for _ in range(retries):\n",
192 | " response = session.post(url, params=params, data=data)\n",
193 | " if response.status_code == 200:\n",
194 | " response_dict = json.loads(response.text)\n",
195 | " return response_dict.get('page_token', None), response_dict['html_content']\n",
196 | " else:\n",
197 | " time.sleep(sleep)\n",
198 | "\n",
199 | "\n",
200 | "def download_comments(youtube_id, sleep=1):\n",
201 | " session = requests.Session()\n",
202 | " session.headers['User-Agent'] = USER_AGENT\n",
203 | "\n",
204 | " # Get Youtube page with initial comments\n",
205 | " response = session.get(YOUTUBE_COMMENTS_URL.format(youtube_id=youtube_id))\n",
206 | " html = response.text\n",
207 | " reply_cids = extract_reply_cids(html)\n",
208 | "\n",
209 | " ret_cids = []\n",
210 | " for comment in extract_comments(html):\n",
211 | " ret_cids.append(comment['cid'])\n",
212 | " yield comment\n",
213 | "\n",
214 | " page_token = find_value(html, 'data-token')\n",
215 | " session_token = find_value(html, 'XSRF_TOKEN', 4)\n",
216 | "\n",
217 | " first_iteration = True\n",
218 | "\n",
219 | " # Get remaining comments (the same as pressing the 'Show more' button)\n",
220 | " while page_token:\n",
221 | " data = {'video_id': youtube_id,\n",
222 | " 'session_token': session_token}\n",
223 | "\n",
224 | " params = {'action_load_comments': 1,\n",
225 | " 'order_by_time': True,\n",
226 | " 'filter': youtube_id}\n",
227 | "\n",
228 | " if first_iteration:\n",
229 | " params['order_menu'] = True\n",
230 | " else:\n",
231 | " data['page_token'] = page_token\n",
232 | "\n",
233 | " response = ajax_request(session, YOUTUBE_COMMENTS_AJAX_URL, params, data)\n",
234 | " if not response:\n",
235 | " break\n",
236 | "\n",
237 | " page_token, html = response\n",
238 | "\n",
239 | " reply_cids += extract_reply_cids(html)\n",
240 | " for comment in extract_comments(html):\n",
241 | " if comment['cid'] not in ret_cids:\n",
242 | " ret_cids.append(comment['cid'])\n",
243 | " yield comment\n",
244 | "\n",
245 | " first_iteration = False\n",
246 | " time.sleep(sleep)\n",
247 | "\n",
248 | " # Get replies (the same as pressing the 'View all X replies' link)\n",
249 | " for cid in reply_cids:\n",
250 | " data = {'comment_id': cid,\n",
251 | " 'video_id': youtube_id,\n",
252 | " 'can_reply': 1,\n",
253 | " 'session_token': session_token}\n",
254 | "\n",
255 | " params = {'action_load_replies': 1,\n",
256 | " 'order_by_time': True,\n",
257 | " 'filter': youtube_id,\n",
258 | " 'tab': 'inbox'}\n",
259 | "\n",
260 | " response = ajax_request(session, YOUTUBE_COMMENTS_AJAX_URL, params, data)\n",
261 | " if not response:\n",
262 | " break\n",
263 | "\n",
264 | " _, html = response\n",
265 | "\n",
266 | " for comment in extract_comments(html):\n",
267 | " if comment['cid'] not in ret_cids:\n",
268 | " ret_cids.append(comment['cid'])\n",
269 | " yield comment\n",
270 | " time.sleep(sleep)\n",
271 | "\n",
272 | "\n",
273 | "def main(youtube_id, output, limit=100):\n",
274 | "\n",
275 | " try:\n",
276 | "\n",
277 | " if not youtube_id or not output:\n",
278 | " parser.print_usage()\n",
279 | " raise ValueError('you need to specify a Youtube ID and an output filename')\n",
280 | "\n",
281 | " print('Downloading Youtube comments for video:', youtube_id)\n",
282 | " count = 0\n",
283 | " with io.open(output, 'w', encoding='utf8') as fp:\n",
284 | " for comment in download_comments(youtube_id):\n",
285 | " sys.stdout.write(json.dumps(comment, ensure_ascii=False))\n",
286 | " count += 1\n",
287 | " sys.stdout.write('Downloaded %d comment(s)\\r' % count)\n",
288 | " sys.stdout.flush()\n",
289 | " if limit and count >= limit:\n",
290 | " break\n",
291 | " print('\\nDone!')\n",
292 | "\n",
293 | "\n",
294 | " except Exception as e:\n",
295 | " print('Error:', str(e))\n",
296 | " sys.exit(1)\n",
297 | "\n"
298 | ]
299 | },
300 | {
301 | "cell_type": "code",
302 | "execution_count": null,
303 | "metadata": {},
304 | "outputs": [
305 | {
306 | "name": "stderr",
307 | "output_type": "stream",
308 | "text": [
309 | "\n",
310 | " 0%| | 0/10 [00:00, ?it/s]\u001b[A"
311 | ]
312 | },
313 | {
314 | "name": "stdout",
315 | "output_type": "stream",
316 | "text": [
317 | "Downloading Youtube comments for video: ew-8i7UUpLY\n",
318 | "{\"author\": \"B Solo\", \"cid\": \"UgyaulXNutDMC8e8br54AaABAg\", \"time\": \"1 hour ago\", \"text\": \"Draymond Green looks like he just escaped slavery!\"}Downloaded 20 comment(s) appearances than a guy like Bradley Beal speaks to why people think Draymond is overrated. He would not make the all star team on any other team in the league.\"}Downloaded 19 comment(s)es only one ball\\\" crap. hes better then 90 % of yalls PFs, even as a \\\"role player\\\". fuck off\"}Downloaded 16 comment(s)\r"
319 | ]
320 | },
321 | {
322 | "name": "stderr",
323 | "output_type": "stream",
324 | "text": [
325 | "\n"
326 | ]
327 | },
328 | {
329 | "name": "stdout",
330 | "output_type": "stream",
331 | "text": [
332 | "{\"author\": \"Infinit 0\", \"cid\": \"Ugz1AnGIkvGzByb2GJR4AaABAg\", \"time\": \"5 hours ago\", \"text\": \"Draymond needs to accept that he is hostile first off, and work on his craft. He could be a helluva rebounder, like Rodman with his attitude, but he thinks to highly of himself and he isn't putting up double digit scoring numbers. He should reevaluate and change for the better before his time runs out.\"}Downloaded 100 comment(s)n and on. Jalen makes a good point that he rose out of nothing and was a nobody coming into the league. But it's also true that he's become overrated playing on the Warriors.\"}Downloaded 92 comment(s)nd PG13 made a contested shot, but hey, it's still counted as assist. If he can average 5 blocks and 5 steal, then even he got no triple double he will be most underrated. But no, the players got this right, he can't even shoot FT. Oh btw, he's the 2nd in TO per game.\"}Downloaded 47 comment(s)\r"
333 | ]
334 | },
335 | {
336 | "name": "stderr",
337 | "output_type": "stream",
338 | "text": [
339 | "\r",
340 | " 10%|█ | 1/10 [00:07<01:11, 7.99s/it]"
341 | ]
342 | },
343 | {
344 | "name": "stdout",
345 | "output_type": "stream",
346 | "text": [
347 | "\n",
348 | "Done!\n",
349 | "Downloading Youtube comments for video: oOxDLuTXyCo\n",
350 | "{\"author\": \"SERBIA SRB\", \"cid\": \"UgwA19RnoZDvmDCdXpt4AaABAg\", \"time\": \"2 days ago\", \"text\": \"VIVA BARÇA 💪🏻💪🏻💪🏻\"}Downloaded 100 comment(s)day.\"}Downloaded 99 comment(s)er dribbling, better short and medium pass, strrt faster\\nRonaldo plays better with his head, he has a stronger blow, he runs faster for medium and long\"}Downloaded 97 comment(s)fensive game\\nAlso, Argentina needs 2-3 tall and powerful players in order not to yield to high and powerful teams like Germany and France.\"}Downloaded 96 comment(s) a stumbling block for anyone.\\\" While I did understood the meaning of it, I was not able to grasp the full width of those words. You see, what this young man was telling me was that he wouldn't want to play a song that would provoke myself to defile my own conscience with old sinful memories, because if he did, then he would become a stumbling block for me. Very truthful indeed. How much more then, is your conscience defiled when you see nudity, death, violence, and sinful acts or when we hear cursing, lies, rumors, gossip, and hatred? It's true, each and everyone of us is going to give an account for how we lived in this life, and we will be judged according to our works. Are you a stumbling block for others, or do guard your eyes and ears and mouth? \\\"But I say unto you, That every idle word that men shall speak, they shall give account thereof in the day of judgment\\\" (Matthew 12:36)\"}Downloaded 88 comment(s) en nuestro lugar. Jesús se convirtió en nuestro sustituto. \\n\\nEn la Biblia, un carcelero preguntó a sus prisioneros Pablo y Silas: “Señores, ¿qué tengo que hacer para ser salvo? —Cree en el Señor Jesús; así tú y tu familia serán salvos —le contestaron. \\n\\nLa Biblia es clara, cree en Jesús como aquel que cargó tus pecados, murió en tu lugar, fue enterrado y luego resucitado por Dios. Es la sangre de Cristo y la resurrección que nos aseguran la vida eterna cuando lo llamamos nuestro Señor y Salvador. 
“Porque todo el que invoque el nombre del Señor será salvo” (Romanos 10:13). “Todo el que” incluye a todos y cada uno de nosotros. \\n\\nPor lo tanto, si tú entiendes que eres un pecador y crees que Jesucristo vino como el único Redentor del pecado, entonces entiendes el plan de salvación. La pregunta es: ¿Estás listo para implementar el plan, recibiendo el regalo de Dios, Jesucristo? Si es así, cree en Jesucristo, arrepiéntete de tus pecados y entrégale el resto de tu vida a él como tu Señor:\\n\\n“Padre, sé que he quebrantado tus leyes y que mis pecados me han separado de ti. Estoy sinceramente arrepentido y ahora quiero apartarme de mi pasado pecaminoso y dirigirme hacia ti. Por favor, perdóname y ayúdame a no pecar de nuevo. Creo que tu hijo Jesucristo murió por mis pecados, resucitó de la muerte, está vivo y escucha mi oración. Invito a Jesús a que se convierta en el Señor de mi vida, a que gobierne y reine en mi corazón de este día en adelante. Por favor, envía tu Espíritu Santo para que me ayude a obedecerte y a hacer tu voluntad por el resto de mi vida. En el nombre de Jesús oro, amén.”“Arrepiéntanse y bautícese cada uno de ustedes en el nombre de Jesucristo para perdón de sus pecados --les contestó Pedro--, y recibirán el don del Espíritu Santo” (Hechos 2:38). \\n\\nSi decidiste convertirte en cristiano el día de hoy, bienvenido a la familia de Dios. Ahora, como una forma de crecer más cerca de él, la Biblia nos dice que sigamos adelante con nuestro compromiso\\n\\nBautízate como lo ordenó Cristo.\\n\\nDile a otra persona de tu nueva fe en Cristo.\\n\\nPasa tiempo con Dios cada día. No tiene que ser un largo periodo de tiempo. Simplemente desarrolla el hábito diario de orar y leer su Palabra. Pídele a Dios que incremente tu fe y tu comprensión de la Biblia.\\n\\nBusca estar en comunión con otros cristianos. 
Sé parte de un grupo de amigos Cristianos que responda tus inquietudes y te apoye.\\n\\nEncuentra una iglesia local en la que puedas adorar a Dios.\"}Downloaded 71 comment(s)\r"
351 | ]
352 | },
353 | {
354 | "name": "stderr",
355 | "output_type": "stream",
356 | "text": [
357 | "\r",
358 | " 20%|██ | 2/10 [00:16<01:05, 8.13s/it]"
359 | ]
360 | },
361 | {
362 | "name": "stdout",
363 | "output_type": "stream",
364 | "text": [
365 | "\n",
366 | "Done!\n",
367 | "Downloading Youtube comments for video: 32Rj7dv2IRE\n",
368 | "{\"author\": \"MSKANE302\", \"cid\": \"UgzAAxsi0mkbhqQ_3ld4AaABAg\", \"time\": \"1 week ago\", \"text\": \"19:14\\nMightyDuck:Ahhhhh\\nMom:Stop \\nMightyDuck:Believe I can fly\\nMom:Boy shut yo face.Shut yo face\\nMightyDuck:(trynna \\\"shut his face\\\") Um,I can't\\n😂🤣😂🤣😂🤣😂🤣😂🤣😂🤣😂🤣😂🤣😂🤣\"}Downloaded 100 comment(s)sing🏃 you all over .\"}Downloaded 95 comment(s)\r"
369 | ]
370 | },
371 | {
372 | "name": "stderr",
373 | "output_type": "stream",
374 | "text": [
375 | "\r",
376 | " 30%|███ | 3/10 [00:24<00:57, 8.15s/it]"
377 | ]
378 | },
379 | {
380 | "name": "stdout",
381 | "output_type": "stream",
382 | "text": [
383 | "\n",
384 | "Done!\n",
385 | "Downloading Youtube comments for video: RtSS0SJE8oE\n",
386 | "{\"author\": \"Lonewolf3D2\", \"cid\": \"UgyCblUz1kLybjkK3rd4AaABAg\", \"time\": \"7 hours ago\", \"text\": \"2516?\"}Downloaded 20 comment(s)on my cave painting deluxe\"}Downloaded 19 comment(s)All about space\"}Downloaded 14 comment(s)omment(s)\r"
387 | ]
388 | }
389 | ],
390 | "source": [
391 | "wdir = r'/media/mi_air/0F0B7DDE62EEA81E/youtube/comments'\n",
392 | "\n",
393 | "uids = open(r'/media/mi_air/0F0B7DDE62EEA81E/youtube/'+str(datetime.date.today())+'.txt', 'r', encoding='utf8').readlines()\n",
394 | "uids = [i.strip('/watch?v=') for i in uids]\n",
395 | "for i in tqdm(uids[:10]):\n",
396 | " uid = i.strip('\\n')\n",
397 | " output = os.path.join(wdir, uid+'_'+str(datetime.date.today())+'.txt')\n",
398 | " main(uid, output)"
399 | ]
400 | },
401 | {
402 | "cell_type": "code",
403 | "execution_count": null,
404 | "metadata": {
405 | "collapsed": true
406 | },
407 | "outputs": [],
408 | "source": []
409 | }
410 | ],
411 | "metadata": {
412 | "anaconda-cloud": {},
413 | "kernelspec": {
414 | "display_name": "Python [default]",
415 | "language": "python",
416 | "name": "python3"
417 | },
418 | "language_info": {
419 | "codemirror_mode": {
420 | "name": "ipython",
421 | "version": 3
422 | },
423 | "file_extension": ".py",
424 | "mimetype": "text/x-python",
425 | "name": "python",
426 | "nbconvert_exporter": "python",
427 | "pygments_lexer": "ipython3",
428 | "version": "3.5.2"
429 | }
430 | },
431 | "nbformat": 4,
432 | "nbformat_minor": 2
433 | }
434 |
--------------------------------------------------------------------------------
/crawler_vecher_moskva.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "Краулер для \"Вечерней Москвы\""
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {
14 | "collapsed": true
15 | },
16 | "outputs": [],
17 | "source": [
18 | "# Импортируем необходимые библиотеки:\n",
19 | "import requests # http-запросы,\n",
20 | "import re # регулярные выражения,\n",
21 | "from bs4 import BeautifulSoup # удаление тегов html,\n",
22 | "from tqdm import tqdm # красотуля для анализа прогресса.\n",
23 | "import time\n",
24 | "import random"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": 2,
30 | "metadata": {
31 | "collapsed": true
32 | },
33 | "outputs": [],
34 | "source": [
35 | "user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/600.3.18 (KHTML, like Gecko) Version/8.0.3 Safari/600.3.18'\n",
36 | "headers = { 'User-Agent' : user_agent }\n"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 7,
42 | "metadata": {
43 | "collapsed": true
44 | },
45 | "outputs": [],
46 | "source": [
47 | "\n",
48 | "hreffile=open(r\"/home/mi_air/Downloads/VM/href_list_dedup.txt\", \"w\", encoding=\"utf-8\")\n"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 12,
54 | "metadata": {
55 | "collapsed": false
56 | },
57 | "outputs": [
58 | {
59 | "ename": "ConnectionError",
60 | "evalue": "HTTPSConnectionPool(host='vm.ru', port=443): Max retries exceeded with url: /news/2017/02/03/ (Caused by NewConnectionError(': Failed to establish a new connection: [Errno 111] Connection refused',))",
61 | "output_type": "error",
62 | "traceback": [
63 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
64 | "\u001b[0;31mConnectionRefusedError\u001b[0m Traceback (most recent call last)",
65 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/packages/urllib3/connection.py\u001b[0m in \u001b[0;36m_new_conn\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 137\u001b[0m conn = connection.create_connection(\n\u001b[0;32m--> 138\u001b[0;31m (self.host, self.port), self.timeout, **extra_kw)\n\u001b[0m\u001b[1;32m 139\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
66 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/packages/urllib3/util/connection.py\u001b[0m in \u001b[0;36mcreate_connection\u001b[0;34m(address, timeout, source_address, socket_options)\u001b[0m\n\u001b[1;32m 97\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0merr\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 98\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 99\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
67 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/packages/urllib3/util/connection.py\u001b[0m in \u001b[0;36mcreate_connection\u001b[0;34m(address, timeout, source_address, socket_options)\u001b[0m\n\u001b[1;32m 87\u001b[0m \u001b[0msock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbind\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msource_address\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 88\u001b[0;31m \u001b[0msock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconnect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msa\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 89\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0msock\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
68 | "\u001b[0;31mConnectionRefusedError\u001b[0m: [Errno 111] Connection refused",
69 | "\nDuring handling of the above exception, another exception occurred:\n",
70 | "\u001b[0;31mNewConnectionError\u001b[0m Traceback (most recent call last)",
71 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36murlopen\u001b[0;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, **response_kw)\u001b[0m\n\u001b[1;32m 593\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mheaders\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 594\u001b[0;31m chunked=chunked)\n\u001b[0m\u001b[1;32m 595\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
72 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36m_make_request\u001b[0;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[1;32m 349\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 350\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_validate_conn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 351\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mSocketTimeout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mBaseSSLError\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
73 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36m_validate_conn\u001b[0;34m(self, conn)\u001b[0m\n\u001b[1;32m 834\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'sock'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# AppEngine might not have `.sock`\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 835\u001b[0;31m \u001b[0mconn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconnect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 836\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
74 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/packages/urllib3/connection.py\u001b[0m in \u001b[0;36mconnect\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 280\u001b[0m \u001b[0;31m# Add certificate verification\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 281\u001b[0;31m \u001b[0mconn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_new_conn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 282\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
75 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/packages/urllib3/connection.py\u001b[0m in \u001b[0;36m_new_conn\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 146\u001b[0m raise NewConnectionError(\n\u001b[0;32m--> 147\u001b[0;31m self, \"Failed to establish a new connection: %s\" % e)\n\u001b[0m\u001b[1;32m 148\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
76 | "\u001b[0;31mNewConnectionError\u001b[0m: : Failed to establish a new connection: [Errno 111] Connection refused",
77 | "\nDuring handling of the above exception, another exception occurred:\n",
78 | "\u001b[0;31mMaxRetryError\u001b[0m Traceback (most recent call last)",
79 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/adapters.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[1;32m 422\u001b[0m \u001b[0mretries\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax_retries\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 423\u001b[0;31m \u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 424\u001b[0m )\n",
80 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36murlopen\u001b[0;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, **response_kw)\u001b[0m\n\u001b[1;32m 642\u001b[0m retries = retries.increment(method, url, error=e, _pool=self,\n\u001b[0;32m--> 643\u001b[0;31m _stacktrace=sys.exc_info()[2])\n\u001b[0m\u001b[1;32m 644\u001b[0m \u001b[0mretries\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msleep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
81 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/packages/urllib3/util/retry.py\u001b[0m in \u001b[0;36mincrement\u001b[0;34m(self, method, url, response, error, _pool, _stacktrace)\u001b[0m\n\u001b[1;32m 362\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mnew_retry\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_exhausted\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 363\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mMaxRetryError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_pool\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merror\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mResponseError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcause\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 364\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
82 | "\u001b[0;31mMaxRetryError\u001b[0m: HTTPSConnectionPool(host='vm.ru', port=443): Max retries exceeded with url: /news/2017/02/03/ (Caused by NewConnectionError(': Failed to establish a new connection: [Errno 111] Connection refused',))",
83 | "\nDuring handling of the above exception, another exception occurred:\n",
84 | "\u001b[0;31mConnectionError\u001b[0m Traceback (most recent call last)",
85 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mlink\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"https://vm.ru/news/2017/02/03/\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgetHrefs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlink\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
86 | "\u001b[0;32m\u001b[0m in \u001b[0;36mgetHrefs\u001b[0;34m(link)\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;31m# Получаем текст страницы, которая содержит ссылки на все статьи этого дня (в примере - 03.02.2017).\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrequests\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlink\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mheaders\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mheaders\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0mendpage\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mre\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'
'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m''\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m#листаем все новости дня\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mendpage\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
87 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/api.py\u001b[0m in \u001b[0;36mget\u001b[0;34m(url, params, **kwargs)\u001b[0m\n\u001b[1;32m 68\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 69\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msetdefault\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'allow_redirects'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 70\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'get'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparams\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 71\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 72\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
88 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/api.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(method, url, **kwargs)\u001b[0m\n\u001b[1;32m 54\u001b[0m \u001b[0;31m# cases, and look like a memory leak in others.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 55\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0msessions\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSession\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0msession\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 56\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0msession\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 57\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 58\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
89 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/sessions.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)\u001b[0m\n\u001b[1;32m 486\u001b[0m }\n\u001b[1;32m 487\u001b[0m \u001b[0msend_kwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msettings\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 488\u001b[0;31m \u001b[0mresp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprep\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0msend_kwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 489\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 490\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
90 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/sessions.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, **kwargs)\u001b[0m\n\u001b[1;32m 607\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 608\u001b[0m \u001b[0;31m# Send the request\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 609\u001b[0;31m \u001b[0mr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0madapter\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 610\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 611\u001b[0m \u001b[0;31m# Total elapsed time of the request (approximately)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
91 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/adapters.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[1;32m 485\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mProxyError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 486\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 487\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mConnectionError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 488\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 489\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mClosedPoolError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
92 | "\u001b[0;31mConnectionError\u001b[0m: HTTPSConnectionPool(host='vm.ru', port=443): Max retries exceeded with url: /news/2017/02/03/ (Caused by NewConnectionError(': Failed to establish a new connection: [Errno 111] Connection refused',))"
93 | ]
94 | }
95 | ],
96 | "source": [
97 | "link = \"https://vm.ru/news/2017/02/03/\"\n",
98 | "print(getHrefs(link))"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": 11,
104 | "metadata": {
105 | "collapsed": true
106 | },
107 | "outputs": [],
108 | "source": [
109 | "def getHrefs(link):\n",
110 | "\n",
111 | " # Получаем текст страницы, которая содержит ссылки на все статьи этого дня (в примере - 03.02.2017).\n",
112 | " r = requests.get(link,headers=headers)\n",
113 | " endpage = int(re.split('
', re.split('', r.text)[1])[0]) #листаем все новости дня\n",
114 | " for i in range(2,endpage+1):\n",
115 | " newlink = link+'?page=' + str(i)\n",
116 | " # Каждая ссылка на статью оформлена с помощью тега
\n",
117 | " refs=re.split('', re.split('', r.text)[1])[0])\n",
118 | " for i in refs:\n",
119 | " if i.startswith(\"/news/\"):\n",
120 | " ilink = re.split('\">',i)[0]\n",
121 | " print(ilink)\n",
122 | " hreffile.write(\"https://vm.ru/\"+ilink+ \"\\n\")\n",
123 | " time.sleep(random.uniform(1,2))\n",
124 | " \n"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": 16,
130 | "metadata": {
131 | "collapsed": false
132 | },
133 | "outputs": [
134 | {
135 | "ename": "ConnectionError",
136 | "evalue": "HTTPSConnectionPool(host='vm.ru', port=443): Max retries exceeded with url: /news/2017/02/03/ (Caused by NewConnectionError(': Failed to establish a new connection: [Errno 111] Connection refused',))",
137 | "output_type": "error",
138 | "traceback": [
139 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
140 | "\u001b[0;31mConnectionRefusedError\u001b[0m Traceback (most recent call last)",
141 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/packages/urllib3/connection.py\u001b[0m in \u001b[0;36m_new_conn\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 137\u001b[0m conn = connection.create_connection(\n\u001b[0;32m--> 138\u001b[0;31m (self.host, self.port), self.timeout, **extra_kw)\n\u001b[0m\u001b[1;32m 139\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
142 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/packages/urllib3/util/connection.py\u001b[0m in \u001b[0;36mcreate_connection\u001b[0;34m(address, timeout, source_address, socket_options)\u001b[0m\n\u001b[1;32m 97\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0merr\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 98\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 99\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
143 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/packages/urllib3/util/connection.py\u001b[0m in \u001b[0;36mcreate_connection\u001b[0;34m(address, timeout, source_address, socket_options)\u001b[0m\n\u001b[1;32m 87\u001b[0m \u001b[0msock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbind\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msource_address\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 88\u001b[0;31m \u001b[0msock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconnect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msa\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 89\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0msock\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
144 | "\u001b[0;31mConnectionRefusedError\u001b[0m: [Errno 111] Connection refused",
145 | "\nDuring handling of the above exception, another exception occurred:\n",
146 | "\u001b[0;31mNewConnectionError\u001b[0m Traceback (most recent call last)",
147 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36murlopen\u001b[0;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, **response_kw)\u001b[0m\n\u001b[1;32m 593\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mheaders\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 594\u001b[0;31m chunked=chunked)\n\u001b[0m\u001b[1;32m 595\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
148 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36m_make_request\u001b[0;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[1;32m 349\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 350\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_validate_conn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 351\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mSocketTimeout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mBaseSSLError\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
149 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36m_validate_conn\u001b[0;34m(self, conn)\u001b[0m\n\u001b[1;32m 834\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'sock'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# AppEngine might not have `.sock`\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 835\u001b[0;31m \u001b[0mconn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconnect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 836\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
150 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/packages/urllib3/connection.py\u001b[0m in \u001b[0;36mconnect\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 280\u001b[0m \u001b[0;31m# Add certificate verification\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 281\u001b[0;31m \u001b[0mconn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_new_conn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 282\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
151 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/packages/urllib3/connection.py\u001b[0m in \u001b[0;36m_new_conn\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 146\u001b[0m raise NewConnectionError(\n\u001b[0;32m--> 147\u001b[0;31m self, \"Failed to establish a new connection: %s\" % e)\n\u001b[0m\u001b[1;32m 148\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
152 | "\u001b[0;31mNewConnectionError\u001b[0m: : Failed to establish a new connection: [Errno 111] Connection refused",
153 | "\nDuring handling of the above exception, another exception occurred:\n",
154 | "\u001b[0;31mMaxRetryError\u001b[0m Traceback (most recent call last)",
155 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/adapters.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[1;32m 422\u001b[0m \u001b[0mretries\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax_retries\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 423\u001b[0;31m \u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 424\u001b[0m )\n",
156 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36murlopen\u001b[0;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, **response_kw)\u001b[0m\n\u001b[1;32m 642\u001b[0m retries = retries.increment(method, url, error=e, _pool=self,\n\u001b[0;32m--> 643\u001b[0;31m _stacktrace=sys.exc_info()[2])\n\u001b[0m\u001b[1;32m 644\u001b[0m \u001b[0mretries\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msleep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
157 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/packages/urllib3/util/retry.py\u001b[0m in \u001b[0;36mincrement\u001b[0;34m(self, method, url, response, error, _pool, _stacktrace)\u001b[0m\n\u001b[1;32m 362\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mnew_retry\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_exhausted\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 363\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mMaxRetryError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_pool\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merror\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mResponseError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcause\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 364\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
158 | "\u001b[0;31mMaxRetryError\u001b[0m: HTTPSConnectionPool(host='vm.ru', port=443): Max retries exceeded with url: /news/2017/02/03/ (Caused by NewConnectionError(': Failed to establish a new connection: [Errno 111] Connection refused',))",
159 | "\nDuring handling of the above exception, another exception occurred:\n",
160 | "\u001b[0;31mConnectionError\u001b[0m Traceback (most recent call last)",
161 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mrequests\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0madapters\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDEFAULT_RETRIES\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m5\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrequests\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"https://vm.ru/news/2017/02/03/\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mheaders\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mheaders\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mendpage\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mre\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'
'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m''\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m#листаем все новости дня\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mendpage\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mnewlink\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlink\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;34m'?page='\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
162 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/api.py\u001b[0m in \u001b[0;36mget\u001b[0;34m(url, params, **kwargs)\u001b[0m\n\u001b[1;32m 68\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 69\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msetdefault\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'allow_redirects'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 70\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'get'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparams\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 71\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 72\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
163 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/api.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(method, url, **kwargs)\u001b[0m\n\u001b[1;32m 54\u001b[0m \u001b[0;31m# cases, and look like a memory leak in others.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 55\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0msessions\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSession\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0msession\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 56\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0msession\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 57\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 58\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
164 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/sessions.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)\u001b[0m\n\u001b[1;32m 486\u001b[0m }\n\u001b[1;32m 487\u001b[0m \u001b[0msend_kwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msettings\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 488\u001b[0;31m \u001b[0mresp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprep\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0msend_kwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 489\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 490\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
165 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/sessions.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, **kwargs)\u001b[0m\n\u001b[1;32m 607\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 608\u001b[0m \u001b[0;31m# Send the request\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 609\u001b[0;31m \u001b[0mr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0madapter\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 610\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 611\u001b[0m \u001b[0;31m# Total elapsed time of the request (approximately)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
166 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/adapters.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[1;32m 485\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mProxyError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 486\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 487\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mConnectionError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 488\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 489\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mClosedPoolError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
167 | "\u001b[0;31mConnectionError\u001b[0m: HTTPSConnectionPool(host='vm.ru', port=443): Max retries exceeded with url: /news/2017/02/03/ (Caused by NewConnectionError(': Failed to establish a new connection: [Errno 111] Connection refused',))"
168 | ]
169 | }
170 | ],
171 | "source": [
172 | "requests.adapters.DEFAULT_RETRIES = 5\n",
173 | "r = requests.get(\"https://vm.ru/news/2017/02/03/\",headers=headers)\n",
174 | "endpage = int(re.split('
', re.split('', r.text)[1])[0]) #листаем все новости дня\n",
175 | "for i in range(2,endpage+1):\n",
176 | " newlink = link+'?page=' + str(i)\n",
177 | "\n",
178 | "refs=re.split('
', re.split('', r.text)[1])[0])\n",
179 | "for i in refs:\n",
180 | " if i.startswith(\"/news/\"):\n",
181 | " ilink = re.split('\">',i)[0]\n",
182 | " print(ilink)\n",
183 | " hreffile.write(\"https://vm.ru/\"+ilink+ \"\\n\")\n",
184 | " "
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": null,
190 | "metadata": {
191 | "collapsed": true
192 | },
193 | "outputs": [],
194 | "source": [
195 | "\n",
196 | "for year in range(2010,2018):\n",
197 | " for month in range (1,13):\n",
198 | " if month <10:\n",
199 | " month = \"0\" + str(month)\n",
200 | " for day in range(1,32):\n",
201 | " if day<10:\n",
202 | " day = \"0\" + str(day)\n",
203 | " newlink = \"https://vm.ru/news/\" + str(year) +\"/\"+str(month)+\"/\"+str(day)+\"/\"\n",
204 | " print(newlink)\n",
205 | " try:\n",
206 | " getHrefs(newlink)\n",
207 | " \n",
208 | " except:\n",
209 | " pass\n"
210 | ]
211 | }
212 | ],
213 | "metadata": {
214 | "anaconda-cloud": {},
215 | "kernelspec": {
216 | "display_name": "Python [default]",
217 | "language": "python",
218 | "name": "python3"
219 | },
220 | "language_info": {
221 | "codemirror_mode": {
222 | "name": "ipython",
223 | "version": 3
224 | },
225 | "file_extension": ".py",
226 | "mimetype": "text/x-python",
227 | "name": "python",
228 | "nbconvert_exporter": "python",
229 | "pygments_lexer": "ipython3",
230 | "version": "3.5.2"
231 | }
232 | },
233 | "nbformat": 4,
234 | "nbformat_minor": 2
235 | }
236 |
--------------------------------------------------------------------------------
/иа_панорама.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "#https://panorama.pub/category/news/page/83\n",
12 | "#https://panorama.pub/category/stati/page/3"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 1,
18 | "metadata": {
19 | "collapsed": true
20 | },
21 | "outputs": [],
22 | "source": [
23 | "# Импортируем необходимые библиотеки:\n",
24 | "import requests # http-запросы,\n",
25 | "import re # регулярные выражения,\n",
26 | "from bs4 import BeautifulSoup # удаление тегов html,\n",
27 | "import time\n",
28 | "import random\n",
29 | "import os\n",
30 | "import tqdm"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 2,
36 | "metadata": {
37 | "collapsed": true
38 | },
39 | "outputs": [],
40 | "source": [
41 | "user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/600.3.18 (KHTML, like Gecko) Version/8.0.3 Safari/600.3.18'\n",
42 | "headers = { 'User-Agent' : user_agent }"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": 3,
48 | "metadata": {
49 | "collapsed": true
50 | },
51 | "outputs": [],
52 | "source": [
53 | "startlink = r'https://panorama.pub/category/news/page/'\n",
54 | "hreflist = []"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": 19,
60 | "metadata": {
61 | "collapsed": true
62 | },
63 | "outputs": [],
64 | "source": [
65 | "def GetLinks(startlink, rang=84):\n",
66 | " lst = []\n",
67 | " for letter in range(1,rang):\n",
68 | " link = startlink +str(letter)+'/'\n",
69 | " lst.append(link)\n",
70 | " return lst"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": 9,
76 | "metadata": {
77 | "collapsed": true
78 | },
79 | "outputs": [],
80 | "source": [
81 | "links = GetLinks(startlink)"
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": 15,
87 | "metadata": {},
88 | "outputs": [],
89 | "source": [
90 | "def GetTextLinks(link):\n",
91 | " lst = []\n",
92 | " r = requests.get(link,headers=headers)\n",
93 | " links = re.split(\"