├── README.md ├── subtitles.ipynb ├── crawler_postnauka.ipynb ├── add_proza_ru.py ├── add_stihi_ru.py ├── youtube_crawl_demo.ipynb ├── crawler_vecher_moskva.ipynb └── иа_панорама.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # crawlers 2 | Краулеры для проекта Taiga Corpus и Taiga Parser, скачивание ресурсов из открытых источников 3 | 4 | Ресурсы к скачиванию: 5 | Новости 6 | 7 | Фонтанка (Оля) 8 | Ведомости (Оля) 9 | Известия (Оля) 10 | Интерфакс (Таня) 11 | Комсомольская Правда (Таня) 12 | Лента ру (Таня) 13 | Газета ру (Оля) 14 | 15 | Худлит 16 | 17 | Журнальный зал (Таня) 18 | 19 | Остальное 20 | 21 | Прожито ру (?) 22 | oral history (Оля) 23 | nplus1 (Таня) 24 | postnauka (Таня) 25 | Стихи ру (Таня) 26 | Проза ру (Таня) 27 | Арзамас (Оля) 28 | 29 | Все скачанные данными скриптами материалы необходимо проверять на наличие лишних тегов, достоверность метатекстовой разметки и дедублицировать отдельно. 30 | -------------------------------------------------------------------------------- /subtitles.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 16, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import os\n", 12 | "import pysrt" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 17, 18 | "metadata": { 19 | "collapsed": false 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "\n", 24 | "for file in os.listdir(r\"/home/mi_air/Downloads/18 to Life\"):\n", 25 | " if file.endswith(\"ru.srt\"):\n", 26 | " \n", 27 | " filename = r\"/home/mi_air/Downloads/18 to Life/\" + file\n", 28 | " filenametxt = filename + \".txt\"\n", 29 | " out = open(filenametxt, \"w\", encoding=\"utf-8\")\n", 30 | " fl = open(filename, \"r\", encoding=\"utf-8\")\n", 31 | " out.write(fl.read())" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 
22, 37 | "metadata": { 38 | "collapsed": false 39 | }, 40 | "outputs": [ 41 | { 42 | "name": "stdout", 43 | "output_type": "stream", 44 | "text": [ 45 | "126271\n" 46 | ] 47 | } 48 | ], 49 | "source": [ 50 | "wordcount=0\n", 51 | "for file in os.listdir(r\"/home/mi_air/Downloads/18 to Life\"):\n", 52 | " if file.endswith(\"ru.srt.txt\"):\n", 53 | " filename = r\"/home/mi_air/Downloads/18 to Life/\" + file\n", 54 | " fl = open(filename, \"r\", encoding=\"utf-8\")\n", 55 | " for line in fl:\n", 56 | " wordlist = line.split(\" \")\n", 57 | " if line.isdigit()==False:\n", 58 | " #print(line)\n", 59 | " wordcount += len(wordlist)\n", 60 | "print (wordcount)" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": { 67 | "collapsed": true 68 | }, 69 | "outputs": [], 70 | "source": [] 71 | } 72 | ], 73 | "metadata": { 74 | "anaconda-cloud": {}, 75 | "kernelspec": { 76 | "display_name": "Python [default]", 77 | "language": "python", 78 | "name": "python3" 79 | }, 80 | "language_info": { 81 | "codemirror_mode": { 82 | "name": "ipython", 83 | "version": 3 84 | }, 85 | "file_extension": ".py", 86 | "mimetype": "text/x-python", 87 | "name": "python", 88 | "nbconvert_exporter": "python", 89 | "pygments_lexer": "ipython3", 90 | "version": "3.5.2" 91 | } 92 | }, 93 | "nbformat": 4, 94 | "nbformat_minor": 2 95 | } 96 | -------------------------------------------------------------------------------- /crawler_postnauka.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Краулер для постнауки,\n", 8 | "\n", 9 | "только чистые тексты FAQ\n", 10 | "848 публикаций\n", 11 | "\n", 12 | "https://postnauka.ru/faq" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 1, 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "# Импортируем необходимые библиотеки:\n", 
24 | "import random\n", 25 | "import time\n", 26 | "import requests # http-запросы,\n", 27 | "import re # регулярные выражения,\n", 28 | "from bs4 import BeautifulSoup # удаление тегов html,\n", 29 | "from tqdm import tqdm # красотуля для анализа прогресса." 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": { 36 | "collapsed": true 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "#список всех ссылок\n", 41 | "hrefs = []\n", 42 | "out = open(r\"/home/mi_air/Downloads/spisok_postnauka_hrefs.txt\",\"w\", encoding=\"utf-8\")" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 3, 48 | "metadata": { 49 | "collapsed": true 50 | }, 51 | "outputs": [], 52 | "source": [ 53 | "start = 72417\n", 54 | "faq = \"https://postnauka.ru/faq/\"\n", 55 | "special = \"https://postnauka.ru/specials/\"" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 6, 61 | "metadata": { 62 | "collapsed": true 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "def getHref(link):\n", 67 | " bl = 0\n", 68 | " r = requests.get(link)\n", 69 | " for line in r.text:\n", 70 | " if '25000:\n", 84 | " link = faq + str(start)\n", 85 | " if getHref(link)==1:\n", 86 | " hrefs.append(link)\n", 87 | " link2 = special + str(start)\n", 88 | " if getHref(link2)==1:\n", 89 | " hrefs.append(link2)\n", 90 | " start -= 1" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 8, 96 | "metadata": { 97 | "collapsed": true 98 | }, 99 | "outputs": [], 100 | "source": [ 101 | "hrefs = list(set(hrefs))" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 9, 107 | "metadata": { 108 | "collapsed": true 109 | }, 110 | "outputs": [], 111 | "source": [ 112 | "for i in hrefs:\n", 113 | " out.write(i+\"\\n\")" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": { 120 | "collapsed": true 121 | }, 122 | "outputs": [], 123 | "source": [] 124 | } 125 | ], 126 | 
"metadata": { 127 | "anaconda-cloud": {}, 128 | "kernelspec": { 129 | "display_name": "Python [default]", 130 | "language": "python", 131 | "name": "python3" 132 | }, 133 | "language_info": { 134 | "codemirror_mode": { 135 | "name": "ipython", 136 | "version": 3 137 | }, 138 | "file_extension": ".py", 139 | "mimetype": "text/x-python", 140 | "name": "python", 141 | "nbconvert_exporter": "python", 142 | "pygments_lexer": "ipython3", 143 | "version": "3.5.2" 144 | } 145 | }, 146 | "nbformat": 4, 147 | "nbformat_minor": 2 148 | } 149 | -------------------------------------------------------------------------------- /add_proza_ru.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[3]: 5 | 6 | # Импортируем необходимые библиотеки: 7 | import random 8 | import time 9 | import requests # http-запросы, 10 | import os 11 | import re # регулярные выражения, 12 | from bs4 import BeautifulSoup # удаление тегов html, 13 | from tqdm import tqdm # красотуля для анализа прогресса. 
14 | import unify 15 | 16 | 17 | # In[29]: 18 | 19 | genre_dic = {"миниатюры":"Малые формы",\ 20 | "новеллы":"Малые формы",\ 21 | "рассказы":"Малые формы",\ 22 | "репортажи":"Малые формы",\ 23 | "повести":"Крупные формы",\ 24 | "романы":"Крупные формы",\ 25 | "драматургия":"Жанровые произведения",\ 26 | "детективы":"Жанровые произведения",\ 27 | "приключения":"Жанровые произведения",\ 28 | "фантастика":"Жанровые произведения",\ 29 | "фэнтези":"Жанровые произведения",\ 30 | "ужасы":"Жанровые произведения",\ 31 | "киберпанк":"Жанровые произведения",\ 32 | "эротическая проза":"Жанровые произведения",\ 33 | "юмористическая проза":"Юмор",\ 34 | "ироническая проза":"Юмор",\ 35 | "фельетоны":"Юмор",\ 36 | "анекдоты":"Юмор",\ 37 | "байки":"Юмор",\ 38 | "история и политика":"Эссе и статьи",\ 39 | "литературоведение":"Эссе и статьи",\ 40 | "естествознание":"Эссе и статьи",\ 41 | "публицистика":"Эссе и статьи",\ 42 | "философия":"Эссе и статьи",\ 43 | "религия":"Эссе и статьи",\ 44 | "мистика":"Эссе и статьи",\ 45 | "мемуары":"Эссе и статьи",\ 46 | "критические статьи":"Литературная критика",\ 47 | "литературные обзоры":"Литературная критика",\ 48 | "музыкальные и кинообзоры":"Литературная критика",\ 49 | "литература для детей":"Детские разделы",\ 50 | "рассказы о детях":"Детские разделы",\ 51 | "сказки":"Детские разделы",\ 52 | "детское творчество":"Детские разделы",\ 53 | "стихи":"Поэзия",\ 54 | "стихотворения в прозе":"Поэзия",\ 55 | "литературные переводы":"Переводы и проза на других языках",\ 56 | "проза на других языках":"Переводы и проза на других языках"} 57 | 58 | 59 | # In[30]: 60 | 61 | rubrics = {"05":"миниатюры",\ 62 | "21":"новеллы",\ 63 | "02":"рассказы",\ 64 | "30":"репортажи",\ 65 | "01":"повести",\ 66 | "04":"романы",\ 67 | "13":"драматургия",\ 68 | "07":"детективы",\ 69 | "23":"приключения",\ 70 | "06":"фантастика",\ 71 | "24":"фэнтези",\ 72 | "25":"ужасы",\ 73 | "26":"киберпанк",\ 74 | "03":"эротическая проза",\ 75 | "08":"юмористическая проза",\ 76 | 
"16":"ироническая проза",\ 77 | "09":"фельетоны",\ 78 | "27":"анекдоты",\ 79 | "28":"байки",\ 80 | "31":"история и политика",\ 81 | "10":"литературоведение",\ 82 | "32":"естествознание",\ 83 | "11":"публицистика",\ 84 | "33":"философия",\ 85 | "34":"религия",\ 86 | "35":"мистика",\ 87 | "18":"мемуары",\ 88 | "12":"критические статьи",\ 89 | "41":"литературные обзоры",\ 90 | "42":"музыкальные и кинообзоры",\ 91 | "17":"литература для детей",\ 92 | "51":"рассказы о детях",\ 93 | "52":"сказки",\ 94 | "50":"детское творчество",\ 95 | "39":"стихи",\ 96 | "43":"стихотворения в прозе",\ 97 | "15":"литературные переводы",\ 98 | "44":"проза на других языках"} 99 | 100 | 101 | # In[31]: 102 | 103 | #задаем хэдеры - они понадобятся еще много раз 104 | user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/600.3.18 (KHTML, like Gecko) Version/8.0.3 Safari/600.3.18' 105 | headers = { 'User-Agent' : user_agent } 106 | 107 | 108 | # In[32]: 109 | 110 | def ensure_dir(directory): 111 | 112 | if not os.path.exists(directory): 113 | os.makedirs(directory, mode=0o777, exist_ok=False) 114 | return directory 115 | 116 | 117 | # In[25]: 118 | 119 | def make_daily_link(year, month, day, topic='all'): 120 | return 'http://www.proza.ru/poems/list.html?day='+str(day)+'&month='+str(month)+'&year='+str(year)+'&topic='+str(topic) 121 | 122 | 123 | # In[33]: 124 | 125 | def get_number_poems(link): 126 | r = requests.get(link, headers=headers) 127 | 128 | num = int(re.split(' по', re.split('

Произведения в обратном порядке с ', r.text)[1])[0]) 129 | return num 130 | 131 | 132 | # In[74]: 133 | 134 | def get_poem_links(link): 135 | #ссылка на текст, заголовок, имя автора, ссылка на автора, дата и время 136 | r = requests.get(link, headers=headers) 137 | allinks = re.split('"textlink nounline', re.split('опубликовать произведение', r.text)[1])[0].split('') 138 | textinfo = [] 139 | 140 | for part in allinks: 141 | 142 | if 'Авторские анонсы' not in part: 143 | for l in part.split('\n'): 144 | 145 | if "poemlink" in l: 146 | poemlink = 'http://www.proza.ru'+re.split('" ', re.split('

  • ', l)[1])[0] 148 | author = re.split('', re.split('class="authorlink">', l)[1])[0] 149 | authorlink = 'http://www.proza.ru/avtor/'+re.split('" class="authorlink', re.split('href="/avtor/', l)[1])[0] 150 | datetime = re.split('
  • ', re.split('- ', l)[1])[0] 151 | date = datetime.split()[0] 152 | time = datetime.split()[1] 153 | textinfo.append([poemlink, title, author, authorlink,date,time]) 154 | return textinfo 155 | 156 | 157 | # In[76]: 158 | 159 | def get_poem_links_by_date(daily_link): 160 | r = requests.get(daily_link, headers=headers) 161 | text_info = [] 162 | 163 | lines = r.text.split('\n') 164 | starts = ['http://www.proza.ru'+re.split('">', re.split('", re.split('
    ', r.text)[1])[0] 191 | # "Откусываем" оставшиеся теги. 192 | beaux_text=BeautifulSoup(text, "lxml") 193 | n_text = beaux_text.get_text() 194 | n_text = re.sub('\xa0', '', n_text) 195 | n_text = unify.unify_sym(n_text) 196 | return(n_text) 197 | 198 | 199 | # In[77]: 200 | 201 | #Теперь список авторов нам нужно превратить с список данных автора и ссылки на его тексты: 202 | def getAuthorInfo(authorlink): 203 | r = requests.get(authorlink, headers=headers) 204 | 205 | try: 206 | 207 | author_items = re.split("", re.split("Произведений: ", r.text)[1])[0] 208 | author_readeres = re.split("", re.split("Читателей: ", r.text)[1])[0] 209 | return author_items, author_readeres 210 | 211 | except: 212 | return '', '' 213 | 214 | 215 | # In[12]: 216 | 217 | WDIR = ensure_dir(r'/home/tsha/proza_ru/texts') 218 | 219 | 220 | # In[ ]: 221 | 222 | for year in range(2005,2008)[::-1]: 223 | metatable_texts = open(ensure_dir(r'/home/tsha/proza_ru/meta/'+str(year))+'/metatable_texts.txt', 'a', encoding='utf8') 224 | metatable_texts.write('textid\tURL\ttitle\tauthor\tauthorlink\tdate\ttime\tpath\tauthor_readers\tauthor_texts\ttopic\tgenre\n') 225 | #textid, poemlink, title, author, authorlink,date,time, path, author_readers,author_poems,topic,genre 226 | for month in range(1,13)[::-1]: 227 | if month < 10: 228 | month = "0" + str(month) 229 | path = ensure_dir(WDIR + "/"+str(year)+"/"+str(month)) 230 | for day in range(1, 32)[::-1]: 231 | if day < 10: 232 | day = "0" + str(day) 233 | if year==2007 and int(month)==12 and int(day)>=3 : 234 | pass 235 | else: 236 | for topic in rubrics: 237 | print(year, month, day,rubrics[topic] ) 238 | link = make_daily_link(year, month, day, topic) 239 | text_info = get_poem_links_by_date(link) 240 | 241 | #вот здесь по-другому 242 | for i in tqdm(range(len(text_info))): 243 | textid = str(year)+str(month)+str(day)+str(i)+str(topic) 244 | textlink = text_info[i][0] 245 | 246 | 247 | try: 248 | text = getTextStihi(textlink) 249 | textfile = 
open(os.path.join(path, textid+'.txt'), 'w', encoding='utf8') 250 | textfile.write(text) 251 | textfile.close() 252 | author_poems, author_readers = getAuthorInfo(text_info[i][3]) 253 | genre = genre_dic[rubrics[topic]] 254 | textfeats = [textid]+text_info[i] + [os.path.join(path, textid+'.txt'),author_poems, author_readers, topic, genre] 255 | metatable_texts.write("\t".join(textfeats)+'\n') 256 | except: 257 | continue 258 | print(textlink) 259 | metatable_texts.close() 260 | 261 | 262 | 263 | -------------------------------------------------------------------------------- /add_stihi_ru.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[3]: 5 | 6 | # Импортируем необходимые библиотеки: 7 | import random 8 | import time 9 | import requests # http-запросы, 10 | import os 11 | import re # регулярные выражения, 12 | from bs4 import BeautifulSoup # удаление тегов html, 13 | from tqdm import tqdm # красотуля для анализа прогресса. 
14 | import unify 15 | 16 | 17 | # In[29]: 18 | 19 | genre_dic = {'любовная лирика': 'лирика', 'гражданская лирика': 'лирика лирика', 'пейзажная лирика': 'лирика','городская лирика': 'лирика','религиозная лирика': 'лирика','философская лирика': 'лирика','мистика и эзотерика': 'лирика', 'циклы стихов': 'Крупные формы', 'поэмы': 'Крупные формы','пьесы': 'Крупные формы','запад: сонеты, канцоны, рондо':'Твердые формы','без рубрики':'Стихи без рубрики', 'восток: рубаи, хокку, танка':'Твердые формы', 'акростихи':'Твердые формы', 'верлибр':'Свободные формы и проза', 'белый и вольный стих':'Свободные формы и проза', 'cтихотворения в прозе':'Свободные формы и проза', 'прозаические миниатюры':'Свободные формы и проза','эссе и статьи':'Свободные формы и проза', 'афоризмы':'Свободные формы и проза', 'пародии':'Пародии и юмор','подражания':'Пародии и юмор','шуточные стихи':'Пародии и юмор','иронические стихи':'Пародии и юмор','сатирические стихи':'Пародии и юмор','басни':'Пародии и юмор', 'стихи для детей':'Детские разделы','детское творчество':'Детские разделы','авторская песня':'Музыкальное творчество', 'эстрадная песня':'Музыкальное творчество','русский рок':'Музыкальное творчество','либретто':'Музыкальное творчество','шансон':'Музыкальное творчество', 'переводы песен':'Музыкальное творчество','переделки песен':'Музыкальное творчество','поэтические переводы':'Переводы и стихи на других языках', 'стихи на других языках':'Переводы и стихи на других языках','переводы песен':'Переводы и стихи на других языках','без рубрики':'Стихи без рубрики'} 20 | 21 | 22 | # In[30]: 23 | 24 | rubrics = {"01":"любовная лирика",\ 25 | "14":"гражданская лирика",\ 26 | "02":"пейзажная лирика",\ 27 | "08":"городская лирика",\ 28 | "19":"религиозная лирика",\ 29 | "17":"философская лирика",\ 30 | "13":"мистика и эзотерика",\ 31 | "04":"циклы стихов",\ 32 | "46":"поэмы",\ 33 | "47":"пьесы",\ 34 | "22":"запад: сонеты, канцоны, рондо",\ 35 | "18":"восток: рубаи, хокку, танка",\ 36 | "44":"акростихи",\ 
37 | "32":"верлибр",\ 38 | "15":"белый и вольный стих",\ 39 | "39":"cтихотворения в прозе",\ 40 | "05":"прозаические миниатюры",\ 41 | "40":"эссе и статьи",\ 42 | "06":"афоризмы",\ 43 | "12":"пародии",\ 44 | "21":"подражания",\ 45 | "11":"шуточные стихи",\ 46 | "16":"иронические стихи",\ 47 | "34":"сатирические стихи",\ 48 | "41":"басни",\ 49 | "10":"стихи для детей",\ 50 | "50":"детское творчество",\ 51 | "23":"авторская песня",\ 52 | "24":"эстрадная песня",\ 53 | "25":"русский рок",\ 54 | "26":"либретто",\ 55 | "37":"шансон",\ 56 | "43":"переводы песен",\ 57 | "33":"переделки песен",\ 58 | "20":"поэтические переводы",\ 59 | "36":"стихи на других языках",\ 60 | "43":"переводы песен",\ 61 | "03":"без рубрики"} 62 | 63 | 64 | # In[31]: 65 | 66 | #задаем хэдеры - они понадобятся еще много раз 67 | user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/600.3.18 (KHTML, like Gecko) Version/8.0.3 Safari/600.3.18' 68 | headers = { 'User-Agent' : user_agent } 69 | 70 | 71 | # In[32]: 72 | 73 | def ensure_dir(directory): 74 | 75 | if not os.path.exists(directory): 76 | os.makedirs(directory, mode=0o777, exist_ok=True) 77 | return directory 78 | 79 | 80 | # In[25]: 81 | 82 | def make_daily_link(year, month, day, topic='all'): 83 | return 'http://www.stihi.ru/poems/list.html?day='+str(day)+'&month='+str(month)+'&year='+str(year)+'&topic='+str(topic) 84 | 85 | 86 | # In[33]: 87 | 88 | def get_number_poems(link): 89 | r = requests.get(link, headers=headers) 90 | 91 | num = int(re.split(' по', re.split('

    Произведения в обратном порядке с ', r.text)[1])[0]) 92 | return num 93 | 94 | 95 | # In[74]: 96 | 97 | def get_poem_links(link): 98 | #ссылка на текст, заголовок, имя автора, ссылка на автора, дата и время 99 | r = requests.get(link, headers=headers) 100 | allinks = re.split('"textlink nounline', re.split('опубликовать произведение', r.text)[1])[0].split('') 101 | textinfo = [] 102 | 103 | for part in allinks: 104 | 105 | if 'Авторские анонсы' not in part: 106 | for l in part.split('\n'): 107 | 108 | if "poemlink" in l: 109 | poemlink = 'http://www.stihi.ru'+re.split('" ', re.split('

  • ', l)[1])[0] 111 | author = re.split('', re.split('class="authorlink">', l)[1])[0] 112 | authorlink = 'http://www.stihi.ru/avtor/'+re.split('" class="authorlink', re.split('href="/avtor/', l)[1])[0] 113 | datetime = re.split('
  • ', re.split('- ', l)[1])[0] 114 | date = datetime.split()[0] 115 | time = datetime.split()[1] 116 | textinfo.append([poemlink, title, author, authorlink,date,time]) 117 | return textinfo 118 | 119 | 120 | # In[76]: 121 | 122 | def get_poem_links_by_date(daily_link): 123 | r = requests.get(daily_link, headers=headers) 124 | text_info = [] 125 | 126 | lines = r.text.split('\n') 127 | starts = ['http://www.stihi.ru'+re.split('">', re.split('", re.split('
    ', r.text)[1])[0] 154 | # "Откусываем" оставшиеся теги. 155 | beaux_text=BeautifulSoup(text, "lxml") 156 | n_text = beaux_text.get_text() 157 | n_text = re.sub('\xa0', '', n_text) 158 | n_text = unify.unify_sym(n_text) 159 | return(n_text) 160 | 161 | 162 | # In[77]: 163 | 164 | #Теперь список авторов нам нужно превратить с список данных автора и ссылки на его тексты: 165 | def getAuthorInfo(authorlink): 166 | r = requests.get(authorlink, headers=headers) 167 | 168 | try: 169 | 170 | author_items = re.split("", re.split("Произведений: ", r.text)[1])[0] 171 | author_readeres = re.split("", re.split("Читателей: ", r.text)[1])[0] 172 | return author_items, author_readeres 173 | 174 | except: 175 | return '', '' 176 | 177 | 178 | # In[12]: 179 | 180 | WDIR = ensure_dir(r'/home/tsha/stihi_ru/texts') 181 | 182 | 183 | # In[ ]: 184 | 185 | for year in range(2005,2016)[::-1]: 186 | metatable_texts = open(ensure_dir(r'/home/tsha/stihi_ru/meta/'+str(year))+'/metatable_texts.txt', 'a', encoding='utf8') 187 | metatable_texts.write('textid\tURL\ttitle\tauthor\tauthorlink\tdate\ttime\tpath\tauthor_readers\tauthor_poems\ttopic\tgenre\n') 188 | #textid, poemlink, title, author, authorlink,date,time, path, author_readers,author_poems,topic,genre 189 | for month in range(1,13)[::-1]: 190 | if month < 10: 191 | month = "0" + str(month) 192 | path = ensure_dir(WDIR + "/"+str(year)+"/"+str(month)) 193 | for day in range(1, 32)[::-1]: 194 | if day < 10: 195 | day = "0" + str(day) 196 | if year==2015 and int(month)==12 : 197 | pass 198 | elif year==2015 and int(month)==11 and int(day)>=11: 199 | pass 200 | else: 201 | for topic in rubrics: 202 | print(year, month, day,rubrics[topic] ) 203 | link = make_daily_link(year, month, day, topic) 204 | text_info = get_poem_links_by_date(link) 205 | 206 | #вот здесь по-другому 207 | for i in tqdm(range(len(text_info))): 208 | textid = str(year)+str(month)+str(day)+str(i)+str(topic) 209 | textlink = text_info[i][0] 210 | 211 | 212 | try: 213 | 
text = getTextStihi(textlink) 214 | textfile = open(os.path.join(path, textid+'.txt'), 'w', encoding='utf8') 215 | textfile.write(text) 216 | textfile.close() 217 | author_poems, author_readers = getAuthorInfo(text_info[i][3]) 218 | genre = genre_dic[rubrics[topic]] 219 | textfeats = [textid]+text_info[i] + [os.path.join(path, textid+'.txt'),author_poems, author_readers, topic, genre] 220 | metatable_texts.write("\t".join(textfeats)+'\n') 221 | except: 222 | continue 223 | print(textlink) 224 | metatable_texts.close() 225 | 226 | 227 | # In[ ]: 228 | 229 | 230 | 231 | -------------------------------------------------------------------------------- /youtube_crawl_demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "/home/mi_air/.local/lib/python3.5/site-packages/requests/__init__.py:91: RequestsDependencyWarning: urllib3 (1.24.1) or chardet (2.3.0) doesn't match a supported version!\n", 13 | " RequestsDependencyWarning)\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "from __future__ import print_function\n", 19 | "\n", 20 | "import os\n", 21 | "import sys\n", 22 | "import time\n", 23 | "import json\n", 24 | "import requests\n", 25 | "import lxml.html\n", 26 | "import io\n", 27 | "from tqdm import tqdm\n", 28 | "from lxml.cssselect import CSSSelector\n", 29 | "\n", 30 | "#used to make the Browser Working\n", 31 | "from selenium import webdriver\n", 32 | "#Send keycodes to Elements\n", 33 | "from selenium.webdriver.common.keys import Keys\n", 34 | "#scrape the url's and comments\n", 35 | "from bs4 import BeautifulSoup\n", 36 | "\n", 37 | "import re\n", 38 | "import datetime\n", 39 | "import time\n", 40 | "import codecs\n", 41 | "\n", 42 | "\n" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 5, 48 | "metadata": { 49 
| "collapsed": true 50 | }, 51 | "outputs": [], 52 | "source": [ 53 | "\n", 54 | "\n", 55 | "YOUTUBE_COMMENTS_URL = 'https://www.youtube.com/all_comments?v={youtube_id}'\n", 56 | "YOUTUBE_COMMENTS_AJAX_URL = 'https://www.youtube.com/comment_ajax'\n", 57 | "\n", 58 | "USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'\n" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 3, 64 | "metadata": {}, 65 | "outputs": [ 66 | { 67 | "data": { 68 | "text/plain": [ 69 | "'2019-04-10'" 70 | ] 71 | }, 72 | "execution_count": 3, 73 | "metadata": {}, 74 | "output_type": "execute_result" 75 | } 76 | ], 77 | "source": [ 78 | "str(datetime.date.today())" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 4, 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "name": "stdout", 88 | "output_type": "stream", 89 | "text": [ 90 | "{'/watch?v=ew-8i7UUpLY', '/watch?v=oOxDLuTXyCo', '/watch?v=32Rj7dv2IRE', '/watch?v=RtSS0SJE8oE', '/watch?v=N0CGM956z18', '/watch?v=fhCLQNYowcE', '/watch?v=AbXahmBSLTk', '/watch?v=ocekebVtZvw', '/watch?v=JM-GwDh73Wc', '/watch?v=BLHrjzTEr0c', '/watch?v=H6Kl8kheGBg', '/watch?v=cnn-z4U_S50', '/watch?v=F1B9Fk_SgI0', '/watch?v=qr4AaWAkf34', '/watch?v=knBhDpMXsQo', '/watch?v=DEfgiRorfbM', '/watch?v=DVH0CzurtIE', '/watch?v=p1JPKLa-Ofc', '/watch?v=nhcDl3S5sXQ', '/watch?v=TcMBFSGVi1c', '/watch?v=b52lho8lD6Q', '/watch?v=kPg3M4C9N9w', '/watch?v=XlmaJ-yU46U', '/watch?v=U_90XNCBatY', '/watch?v=6pdfriFuFt8', '/watch?v=vw2SaHkGfss', '/watch?v=yzZIS1TtXjw', '/watch?v=q97nD5dOS5M', '/watch?v=31OnT5iSLA0', '/watch?v=3y-O-4IL-PU', '/watch?v=RzfO1FbUCo8', '/watch?v=IRKwwk7CXBQ', '/watch?v=ZD9OkKE0TfA', '/watch?v=35adpxPiNlU', '/watch?v=4yXU8K-9SIw', '/watch?v=nDq6TstdEi8', '/watch?v=3XNDaISqFX8', '/watch?v=3p1fHBNILhM', '/watch?v=W1j28DRcFBQ', '/watch?v=qywZ6lUcNo8', '/watch?v=_XFzT9GMmw8', '/watch?v=hE2Ira-Cwxo', '/watch?v=A8N4_cjLXH8', '/watch?v=kvvLXVDYl6I', 
'/watch?v=vEUlnLOQG8k', '/watch?v=ssbNmaOmVMk', '/watch?v=XW_KhFq4LQo', '/watch?v=CaBq3SvO0a4', '/watch?v=6Z6zfRWTotY', '/watch?v=3t195yz9xCc', '/watch?v=YbiKtZSqmB4', '/watch?v=wzjWIxXBs_s', '/watch?v=4D8ezH0iXh8', '/watch?v=cyzqxRHLPpk', '/watch?v=CiL-yTNa6QY', '/watch?v=eKmRkS1os7k', '/watch?v=ufI6DCB6X2U', '/watch?v=c18WvLeJn-I', '/watch?v=f30Jq8BQPQo', '/watch?v=7gvqArR7nlA', '/watch?v=el00pNoRB34', '/watch?v=Ba44js56nF4', '/watch?v=gporsZ8WnsM', '/watch?v=P2qOZDuiYlM', '/watch?v=x865r5EqKDo', '/watch?v=PHgc8Q6qTjc', '/watch?v=TwFvvcHf7Dw', '/watch?v=_nf8GV0AvtI', '/watch?v=AKzFFJXMDyE', '/watch?v=iP0MrLN4xso', '/watch?v=o39KwSswsgw', '/watch?v=iloh1SUe42g', '/watch?v=2apVwq-pX9E', '/watch?v=u4x9YyRnFDE', '/watch?v=hsGOT_0L16U', '/watch?v=kO9bzwqCNgo', '/watch?v=IRUihzQvBMo', '/watch?v=zUyH3XhpLTo', '/watch?v=DFia7FhVmuM', '/watch?v=buCD-_1UPn4', '/watch?v=mFlrc16xjik', '/watch?v=qcGNoZ3r9t8', '/watch?v=XmAsgB4EMR8', '/watch?v=KCSNFZKbhZE', '/watch?v=nMfPqeZjc2c', '/watch?v=eEd2K1FxNQY', '/watch?v=3NycM9lYdRI', '/watch?v=gmU9PBDS-0k', '/watch?v=3fEdoqHCaM8', '/watch?v=jCC8fPQOaxU', '/watch?v=VUArb3AIpm4', '/watch?v=z6buCeA4ZSc', '/watch?v=K3Qzzggn--s', '/watch?v=jwxI0OX3GsA', '/watch?v=xcg_e-FY_Vs', '/watch?v=nvRjW2oYBiU', '/watch?v=b2AcxL88DoI', '/watch?v=ZeTWW47yhC4', '/watch?v=sxt4YCIsn2I', '/watch?v=S1gp0m4B5p8', '/watch?v=laoUmXqscdk', '/watch?v=hC8CH0Z3L54', '/watch?v=eQHo2zo58no', '/watch?v=lZIq7A9zKFs', '/watch?v=C5Gm8UvxKlU', '/watch?v=2KBFD0aoZy8', '/watch?v=JKeG1iJNxGs', '/watch?v=OdV6SkGZb3g', '/watch?v=g9bzrGBzSC4', '/watch?v=b5W9t62t10I', '/watch?v=66Ki5_-E0n4', '/watch?v=yMRoNNKWuqQ', '/watch?v=4cx9apL7HhY', '/watch?v=9DzSGPad_z4', '/watch?v=4H9jTQKmR3Q', '/watch?v=4zI6guqVqiI', '/watch?v=sCD9zjf_YRU', '/watch?v=jfjfzKf85Ac', '/watch?v=Yxnsxg4rs0E', '/watch?v=lFcSrYw-ARY', '/watch?v=1ZYbU82GVz4', '/watch?v=1nnRC6jDOCI', '/watch?v=7Jj83FOlBF8', '/watch?v=L5cLq1mIC70', '/watch?v=4NcoqtHH2IE', '/watch?v=jJys1BM8x8k', '/watch?v=tF0uHeLy1v0', 
'/watch?v=gXKPjSkCSMM', '/watch?v=9uIk_91GQYI', '/watch?v=njHvGxZgTPk', '/watch?v=l8kLiUZDbQ4', '/watch?v=t433PEQGErc', '/watch?v=GRTS9yZJREk', '/watch?v=emKhAptPqg4', '/watch?v=Z-0FXUgVsVs', '/watch?v=CX17qmYO0o0', '/watch?v=WzfRhSU9_qA', '/watch?v=kHkKihbfsXQ', '/watch?v=GMFewiplIbw', '/watch?v=xpVfcZ0ZcFM', '/watch?v=x4o5g_PGkiA', '/watch?v=qvzW_CJTlmM', '/watch?v=VfNvJs7-RM4', '/watch?v=LS_-ZMcGnow', '/watch?v=r34Isj_erU4', '/watch?v=zXtsGAkyeIo', '/watch?v=tKMmMHyLBCE', '/watch?v=0fUMyQlzujU', '/watch?v=4IrkawvzGE8', '/watch?v=k4YRWT_Aldo', '/watch?v=-UOMvxh4MYU', '/watch?v=NwSIgDKvMHk', '/watch?v=au2n7VVGv_c', '/watch?v=HmH4W8JOifg', '/watch?v=IV6IuCTg6MU', '/watch?v=8xQrvclhJrU', '/watch?v=RbMqcFvtMN8', '/watch?v=5nxD4PY39xw', '/watch?v=jC7eeYwKrg0', '/watch?v=txQ6t4yPIM0', '/watch?v=WPni755-Krg', '/watch?v=NymS69shfkc', '/watch?v=_YuMfMLC8FA', '/watch?v=jHcbLgNQ4Co', '/watch?v=wi0q0y7U75c', '/watch?v=JeTzND6XrB0', '/watch?v=r8EF3X8EI2o', '/watch?v=s2Gw6r6HooA', '/watch?v=9QbltzIUV6w', '/watch?v=B8yo1HPW2O4', '/watch?v=JnfP9qKAbk8', '/watch?v=o7W7OvETO40', '/watch?v=vjjS92Q0lYs', '/watch?v=mzs7lmETE90', '/watch?v=qO2Y6BHYhHw', '/watch?v=HO2AJneTjAM', '/watch?v=GFEcOvs6YWk', '/watch?v=rTKodwXQi78', '/watch?v=57p6K-5ZSNc', '/watch?v=-twm7ldMOtI', '/watch?v=nvm6RzVLjWo', '/watch?v=J3UXp9jIr-U', '/watch?v=Cfd6PknS0Fw', '/watch?v=waU75jdUnYw', '/watch?v=ERUugjLmwuY', '/watch?v=MikD7plCDQg', '/watch?v=zxeTC0wKPXs', '/watch?v=OjWsugnahJ0', '/watch?v=zS-Og_RfdNc', '/watch?v=eDuRoPIOBjE', '/watch?v=23e9_o5rxsA', '/watch?v=STZZso9GUhA', '/watch?v=nEDKVNoE2ws', '/watch?v=m8UQ4O7UiDs', '/watch?v=EIeUJcP3T0Q', '/watch?v=7ysFgElQtjI', '/watch?v=jsHX1cFL41w', '/watch?v=qOXXvttM-e8', '/watch?v=9z1nTwP2n0w', '/watch?v=LH4Y1ZUUx2g', '/watch?v=0HpYEZ86Wuc', '/watch?v=2sap-GTtCiU', '/watch?v=8Ap7aJsfaXQ', '/watch?v=IPPYI64aHno', '/watch?v=ygyz3Mqjh0k', '/watch?v=-QKP4iVCaiY', '/watch?v=-bmA1D00B4o', '/watch?v=01ouUdAEFdU', '/watch?v=wWdXfX4Vpm8'}\n" 91 | ] 92 | } 93 | ], 94 | 
"source": [ 95 | "\n", 96 | "# The List where the links to the videos are stored\n", 97 | "links = set()\n", 98 | "\n", 99 | "comments = list()\n", 100 | "\n", 101 | "homePage = 'https:www.youtube.com'\n", 102 | "linksSize = 10\n", 103 | "driver = webdriver.Firefox()\n", 104 | "\n", 105 | "output = open(\"/media/mi_air/0F0B7DDE62EEA81E/youtube/\"+str(datetime.date.today())+\".txt\",\"w\")\n", 106 | "\n", 107 | "def loadFullPage(Timeout):\n", 108 | " reachedbottom = None\n", 109 | " while not reachedbottom:\n", 110 | " #scroll one pane down\n", 111 | " driver.execute_script(\"window.scrollTo(0,Math.max(document.documentElement.scrollHeight,document.body.scrollHeight,document.documentElement.clientHeight));\");\n", 112 | " time.sleep(Timeout)\n", 113 | " #check if the bottom is reached\n", 114 | " a = driver.execute_script(\"return document.documentElement.scrollTop;\")\n", 115 | " b = driver.execute_script(\"return document.documentElement.scrollHeight - document.documentElement.clientHeight;\")\n", 116 | " relativeHeight = a / b\n", 117 | " if(relativeHeight==1):\n", 118 | " reachedbottom = True\n", 119 | "def getComments(link):\n", 120 | " driver.get(url='https:youtube.com'+link)\n", 121 | " loadFullPage(1)\n", 122 | "\n", 123 | "\n", 124 | "def main():\n", 125 | " driver.get(url=homePage)\n", 126 | " enoughLinks = None\n", 127 | "\n", 128 | " while not enoughLinks:\n", 129 | " loadFullPage(1)\n", 130 | "\n", 131 | " soup = BeautifulSoup(driver.page_source, 'html.parser')\n", 132 | "\n", 133 | " for link in soup.find_all(\"a\",class_=\"yt-simple-endpoint style-scope ytd-grid-video-renderer\", href=True):\n", 134 | " if link not in links:\n", 135 | " links.add(link['href'])\n", 136 | "\n", 137 | " if len(links) < linksSize:\n", 138 | " driver.refresh()\n", 139 | " else:\n", 140 | " #for i in range(len(links)-1000):\n", 141 | " #links.pop()\n", 142 | " enoughLinks = True\n", 143 | "\n", 144 | " #links.sort()\n", 145 | " for link in links:\n", 146 | " 
def find_value(html, key, num_chars=2):
    """Return the quoted value that follows `key` in raw `html`.

    `num_chars` is how many characters to skip after the key itself
    (default 2, i.e. the `="` between key and value).  Returns '' when
    the key is not present, so callers can treat a missing token as
    "no more pages".
    """
    pos_key = html.find(key)
    if pos_key == -1:
        # BUG FIX: the original sliced from -1 + len(key) + num_chars,
        # returning garbage when the key was absent.
        return ''
    pos_begin = pos_key + len(key) + num_chars
    pos_end = html.find('"', pos_begin)
    return html[pos_begin: pos_end]


def extract_comments(html):
    """Yield one dict per comment found in a comments HTML fragment.

    Each dict has keys: cid, text, time, author.
    """
    tree = lxml.html.fromstring(html)
    item_sel = CSSSelector('.comment-item')
    text_sel = CSSSelector('.comment-text-content')
    time_sel = CSSSelector('.time')
    author_sel = CSSSelector('.user-name')

    for item in item_sel(tree):
        yield {'cid': item.get('data-cid'),
               'text': text_sel(item)[0].text_content(),
               'time': time_sel(item)[0].text_content().strip(),
               'author': author_sel(item)[0].text_content()}


def extract_reply_cids(html):
    """Return the comment ids that have a 'View all X replies' link."""
    tree = lxml.html.fromstring(html)
    sel = CSSSelector('.comment-replies-header > .load-comments')
    return [i.get('data-cid') for i in sel(tree)]


def ajax_request(session, url, params, data, retries=10, sleep=20):
    """POST to Youtube's comments AJAX endpoint with retries.

    Returns (page_token_or_None, html_content) on HTTP 200, or None
    when all `retries` attempts failed (callers check `if not response`).
    """
    for _ in range(retries):
        response = session.post(url, params=params, data=data)
        if response.status_code == 200:
            response_dict = json.loads(response.text)
            return response_dict.get('page_token', None), response_dict['html_content']
        else:
            time.sleep(sleep)


def download_comments(youtube_id, sleep=1):
    """Generator yielding comment dicts for `youtube_id`, newest-first.

    Walks the initial page, then paginates via the 'Show more' AJAX
    endpoint, then loads every reply thread.  Duplicate cids are
    suppressed.  Sleeps `sleep` seconds between AJAX requests to be
    polite.  Relies on module-level USER_AGENT, YOUTUBE_COMMENTS_URL
    and YOUTUBE_COMMENTS_AJAX_URL constants defined earlier in the
    notebook.
    """
    session = requests.Session()
    session.headers['User-Agent'] = USER_AGENT

    # Get Youtube page with initial comments
    response = session.get(YOUTUBE_COMMENTS_URL.format(youtube_id=youtube_id))
    html = response.text
    reply_cids = extract_reply_cids(html)

    ret_cids = []
    for comment in extract_comments(html):
        ret_cids.append(comment['cid'])
        yield comment

    page_token = find_value(html, 'data-token')
    session_token = find_value(html, 'XSRF_TOKEN', 4)

    first_iteration = True

    # Get remaining comments (the same as pressing the 'Show more' button)
    while page_token:
        data = {'video_id': youtube_id,
                'session_token': session_token}

        params = {'action_load_comments': 1,
                  'order_by_time': True,
                  'filter': youtube_id}

        if first_iteration:
            params['order_menu'] = True
        else:
            data['page_token'] = page_token

        response = ajax_request(session, YOUTUBE_COMMENTS_AJAX_URL, params, data)
        if not response:
            break

        page_token, html = response

        reply_cids += extract_reply_cids(html)
        for comment in extract_comments(html):
            if comment['cid'] not in ret_cids:
                ret_cids.append(comment['cid'])
                yield comment

        first_iteration = False
        time.sleep(sleep)

    # Get replies (the same as pressing the 'View all X replies' link)
    for cid in reply_cids:
        data = {'comment_id': cid,
                'video_id': youtube_id,
                'can_reply': 1,
                'session_token': session_token}

        params = {'action_load_replies': 1,
                  'order_by_time': True,
                  'filter': youtube_id,
                  'tab': 'inbox'}

        response = ajax_request(session, YOUTUBE_COMMENTS_AJAX_URL, params, data)
        if not response:
            break

        _, html = response

        for comment in extract_comments(html):
            if comment['cid'] not in ret_cids:
                ret_cids.append(comment['cid'])
                yield comment
        time.sleep(sleep)


def main(youtube_id, output, limit=100):
    """Download up to `limit` comments for `youtube_id` into `output`.

    Writes one JSON object per line (JSON Lines).  Prints a running
    counter to stdout and exits with status 1 on any error.
    """
    try:
        if not youtube_id or not output:
            # BUG FIX: the original called parser.print_usage() here, but
            # no `parser` exists in this cell -> NameError masked the
            # intended ValueError.
            raise ValueError('you need to specify a Youtube ID and an output filename')

        print('Downloading Youtube comments for video:', youtube_id)
        count = 0
        with io.open(output, 'w', encoding='utf8') as fp:
            for comment in download_comments(youtube_id):
                # BUG FIX: comments were written to sys.stdout instead of
                # fp, so the output file was always left empty.  Write one
                # JSON object per line to the file; keep only the progress
                # counter on stdout.
                fp.write(json.dumps(comment, ensure_ascii=False) + '\n')
                count += 1
                sys.stdout.write('Downloaded %d comment(s)\r' % count)
                sys.stdout.flush()
                if limit and count >= limit:
                    break
        print('\nDone!')

    except Exception as e:
        print('Error:', str(e))
        sys.exit(1)
"\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/packages/urllib3/connection.py\u001b[0m in \u001b[0;36m_new_conn\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 137\u001b[0m conn = connection.create_connection(\n\u001b[0;32m--> 138\u001b[0;31m (self.host, self.port), self.timeout, **extra_kw)\n\u001b[0m\u001b[1;32m 139\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 66 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/packages/urllib3/util/connection.py\u001b[0m in \u001b[0;36mcreate_connection\u001b[0;34m(address, timeout, source_address, socket_options)\u001b[0m\n\u001b[1;32m 97\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0merr\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 98\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 99\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 67 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/packages/urllib3/util/connection.py\u001b[0m in \u001b[0;36mcreate_connection\u001b[0;34m(address, timeout, source_address, socket_options)\u001b[0m\n\u001b[1;32m 87\u001b[0m \u001b[0msock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbind\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msource_address\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 88\u001b[0;31m \u001b[0msock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconnect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msa\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 89\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0msock\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 68 | "\u001b[0;31mConnectionRefusedError\u001b[0m: [Errno 111] Connection refused", 69 | "\nDuring handling of the above exception, another exception occurred:\n", 70 | "\u001b[0;31mNewConnectionError\u001b[0m Traceback (most recent call last)", 71 | 
"\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36murlopen\u001b[0;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, **response_kw)\u001b[0m\n\u001b[1;32m 593\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mheaders\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 594\u001b[0;31m chunked=chunked)\n\u001b[0m\u001b[1;32m 595\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 72 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36m_make_request\u001b[0;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[1;32m 349\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 350\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_validate_conn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 351\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mSocketTimeout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mBaseSSLError\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 73 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36m_validate_conn\u001b[0;34m(self, conn)\u001b[0m\n\u001b[1;32m 834\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'sock'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# AppEngine might not 
have `.sock`\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 835\u001b[0;31m \u001b[0mconn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconnect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 836\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 74 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/packages/urllib3/connection.py\u001b[0m in \u001b[0;36mconnect\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 280\u001b[0m \u001b[0;31m# Add certificate verification\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 281\u001b[0;31m \u001b[0mconn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_new_conn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 282\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 75 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/packages/urllib3/connection.py\u001b[0m in \u001b[0;36m_new_conn\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 146\u001b[0m raise NewConnectionError(\n\u001b[0;32m--> 147\u001b[0;31m self, \"Failed to establish a new connection: %s\" % e)\n\u001b[0m\u001b[1;32m 148\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 76 | "\u001b[0;31mNewConnectionError\u001b[0m: : Failed to establish a new connection: [Errno 111] Connection refused", 77 | "\nDuring handling of the above exception, another exception occurred:\n", 78 | "\u001b[0;31mMaxRetryError\u001b[0m Traceback (most recent call last)", 79 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/adapters.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[1;32m 422\u001b[0m \u001b[0mretries\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax_retries\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 423\u001b[0;31m 
\u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 424\u001b[0m )\n", 80 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36murlopen\u001b[0;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, **response_kw)\u001b[0m\n\u001b[1;32m 642\u001b[0m retries = retries.increment(method, url, error=e, _pool=self,\n\u001b[0;32m--> 643\u001b[0;31m _stacktrace=sys.exc_info()[2])\n\u001b[0m\u001b[1;32m 644\u001b[0m \u001b[0mretries\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msleep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 81 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/packages/urllib3/util/retry.py\u001b[0m in \u001b[0;36mincrement\u001b[0;34m(self, method, url, response, error, _pool, _stacktrace)\u001b[0m\n\u001b[1;32m 362\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mnew_retry\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_exhausted\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 363\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mMaxRetryError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_pool\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merror\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mResponseError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcause\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 364\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 82 | "\u001b[0;31mMaxRetryError\u001b[0m: HTTPSConnectionPool(host='vm.ru', port=443): Max retries exceeded with url: /news/2017/02/03/ (Caused by NewConnectionError(': Failed to establish a new connection: [Errno 111] Connection refused',))", 83 | "\nDuring handling of the above exception, another 
exception occurred:\n", 84 | "\u001b[0;31mConnectionError\u001b[0m Traceback (most recent call last)", 85 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mlink\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"https://vm.ru/news/2017/02/03/\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgetHrefs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlink\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 86 | "\u001b[0;32m\u001b[0m in \u001b[0;36mgetHrefs\u001b[0;34m(link)\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;31m# Получаем текст страницы, которая содержит ссылки на все статьи этого дня (в примере - 03.02.2017).\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrequests\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlink\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mheaders\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mheaders\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0mendpage\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mre\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'
    '\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'
    '\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m#листаем все новости дня\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mendpage\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 87 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/api.py\u001b[0m in \u001b[0;36mget\u001b[0;34m(url, params, **kwargs)\u001b[0m\n\u001b[1;32m 68\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 69\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msetdefault\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'allow_redirects'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 70\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'get'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparams\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 71\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 72\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 88 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/api.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(method, url, **kwargs)\u001b[0m\n\u001b[1;32m 54\u001b[0m \u001b[0;31m# cases, and look like a memory leak in 
others.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 55\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0msessions\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSession\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0msession\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 56\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0msession\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 57\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 58\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 89 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/sessions.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)\u001b[0m\n\u001b[1;32m 486\u001b[0m }\n\u001b[1;32m 487\u001b[0m \u001b[0msend_kwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msettings\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 488\u001b[0;31m \u001b[0mresp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprep\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0msend_kwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 489\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 490\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 90 | 
"\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/sessions.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, **kwargs)\u001b[0m\n\u001b[1;32m 607\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 608\u001b[0m \u001b[0;31m# Send the request\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 609\u001b[0;31m \u001b[0mr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0madapter\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 610\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 611\u001b[0m \u001b[0;31m# Total elapsed time of the request (approximately)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 91 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/adapters.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[1;32m 485\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mProxyError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 486\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 487\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mConnectionError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 488\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 489\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mClosedPoolError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 92 | "\u001b[0;31mConnectionError\u001b[0m: 
def getHrefs(link):
    """Collect article links from a vm.ru daily news listing page.

    Fetches `link` (e.g. https://vm.ru/news/2017/02/03/), determines the
    number of pagination pages for that day, walks every page and writes
    each article URL (paths starting with /news/) to the module-level
    `hreffile`, using the module-level `headers` for every request.

    NOTE(review): the HTML markers in the re.split() calls were stripped
    when this notebook was exported; 'href="' and the pagination marker
    below are best-effort reconstructions -- TODO confirm against the
    live vm.ru markup before running.
    """
    # Page listing all articles of the day (in the example - 03.02.2017).
    r = requests.get(link, headers=headers)
    # Number of the last pagination page ("листаем все новости дня").
    endpage = int(re.split('"', re.split('data-last-page="', r.text)[1])[0])
    for page in range(1, endpage + 1):
        if page > 1:
            # BUG FIX: the original built `newlink` but never fetched it,
            # re-parsing page 1's HTML on every iteration.
            r = requests.get(link + '?page=' + str(page), headers=headers)
        # Every article link is wrapped in an <a href="..."> tag.
        refs = re.split('href="', r.text)
        # BUG FIX: the inner loop used `i`, shadowing the outer page
        # counter; renamed to `ref`.
        for ref in refs:
            if ref.startswith("/news/"):
                ilink = re.split('\">', ref)[0]
                print(ilink)
                hreffile.write("https://vm.ru/" + ilink + "\n")
        time.sleep(random.uniform(1, 2))
"\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/packages/urllib3/util/connection.py\u001b[0m in \u001b[0;36mcreate_connection\u001b[0;34m(address, timeout, source_address, socket_options)\u001b[0m\n\u001b[1;32m 87\u001b[0m \u001b[0msock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbind\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msource_address\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 88\u001b[0;31m \u001b[0msock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconnect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msa\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 89\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0msock\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 144 | "\u001b[0;31mConnectionRefusedError\u001b[0m: [Errno 111] Connection refused", 145 | "\nDuring handling of the above exception, another exception occurred:\n", 146 | "\u001b[0;31mNewConnectionError\u001b[0m Traceback (most recent call last)", 147 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36murlopen\u001b[0;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, **response_kw)\u001b[0m\n\u001b[1;32m 593\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mheaders\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 594\u001b[0;31m chunked=chunked)\n\u001b[0m\u001b[1;32m 595\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 148 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36m_make_request\u001b[0;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[1;32m 349\u001b[0m 
\u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 350\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_validate_conn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 351\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mSocketTimeout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mBaseSSLError\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 149 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36m_validate_conn\u001b[0;34m(self, conn)\u001b[0m\n\u001b[1;32m 834\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'sock'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# AppEngine might not have `.sock`\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 835\u001b[0;31m \u001b[0mconn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconnect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 836\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 150 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/packages/urllib3/connection.py\u001b[0m in \u001b[0;36mconnect\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 280\u001b[0m \u001b[0;31m# Add certificate verification\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 281\u001b[0;31m \u001b[0mconn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_new_conn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 282\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 151 | 
"\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/packages/urllib3/connection.py\u001b[0m in \u001b[0;36m_new_conn\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 146\u001b[0m raise NewConnectionError(\n\u001b[0;32m--> 147\u001b[0;31m self, \"Failed to establish a new connection: %s\" % e)\n\u001b[0m\u001b[1;32m 148\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 152 | "\u001b[0;31mNewConnectionError\u001b[0m: : Failed to establish a new connection: [Errno 111] Connection refused", 153 | "\nDuring handling of the above exception, another exception occurred:\n", 154 | "\u001b[0;31mMaxRetryError\u001b[0m Traceback (most recent call last)", 155 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/adapters.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[1;32m 422\u001b[0m \u001b[0mretries\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax_retries\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 423\u001b[0;31m \u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 424\u001b[0m )\n", 156 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36murlopen\u001b[0;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, **response_kw)\u001b[0m\n\u001b[1;32m 642\u001b[0m retries = retries.increment(method, url, error=e, _pool=self,\n\u001b[0;32m--> 643\u001b[0;31m _stacktrace=sys.exc_info()[2])\n\u001b[0m\u001b[1;32m 644\u001b[0m \u001b[0mretries\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msleep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 157 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/packages/urllib3/util/retry.py\u001b[0m in 
\u001b[0;36mincrement\u001b[0;34m(self, method, url, response, error, _pool, _stacktrace)\u001b[0m\n\u001b[1;32m 362\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mnew_retry\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_exhausted\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 363\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mMaxRetryError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_pool\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merror\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mResponseError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcause\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 364\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 158 | "\u001b[0;31mMaxRetryError\u001b[0m: HTTPSConnectionPool(host='vm.ru', port=443): Max retries exceeded with url: /news/2017/02/03/ (Caused by NewConnectionError(': Failed to establish a new connection: [Errno 111] Connection refused',))", 159 | "\nDuring handling of the above exception, another exception occurred:\n", 160 | "\u001b[0;31mConnectionError\u001b[0m Traceback (most recent call last)", 161 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mrequests\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0madapters\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDEFAULT_RETRIES\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m5\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrequests\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"https://vm.ru/news/2017/02/03/\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mheaders\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mheaders\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mendpage\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mre\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'
    '\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'
    '\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m#листаем все новости дня\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mendpage\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mnewlink\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlink\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;34m'?page='\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 162 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/api.py\u001b[0m in \u001b[0;36mget\u001b[0;34m(url, params, **kwargs)\u001b[0m\n\u001b[1;32m 68\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 69\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msetdefault\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'allow_redirects'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 70\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'get'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparams\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 71\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 72\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 163 | 
"\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/api.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(method, url, **kwargs)\u001b[0m\n\u001b[1;32m 54\u001b[0m \u001b[0;31m# cases, and look like a memory leak in others.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 55\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0msessions\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSession\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0msession\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 56\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0msession\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 57\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 58\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 164 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/sessions.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)\u001b[0m\n\u001b[1;32m 486\u001b[0m }\n\u001b[1;32m 487\u001b[0m \u001b[0msend_kwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msettings\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 488\u001b[0;31m \u001b[0mresp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprep\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0msend_kwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 489\u001b[0m 
\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 490\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 165 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/sessions.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, **kwargs)\u001b[0m\n\u001b[1;32m 607\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 608\u001b[0m \u001b[0;31m# Send the request\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 609\u001b[0;31m \u001b[0mr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0madapter\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 610\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 611\u001b[0m \u001b[0;31m# Total elapsed time of the request (approximately)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 166 | "\u001b[0;32m/home/mi_air/ioSavoy5/lib/python3.5/site-packages/requests/adapters.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[1;32m 485\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mProxyError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 486\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 487\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mConnectionError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 488\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 489\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mClosedPoolError\u001b[0m 
\u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 167 | "\u001b[0;31mConnectionError\u001b[0m: HTTPSConnectionPool(host='vm.ru', port=443): Max retries exceeded with url: /news/2017/02/03/ (Caused by NewConnectionError(': Failed to establish a new connection: [Errno 111] Connection refused',))" 168 | ] 169 | } 170 | ], 171 | "source": [ 172 | "requests.adapters.DEFAULT_RETRIES = 5\n", 173 | "r = requests.get(\"https://vm.ru/news/2017/02/03/\",headers=headers)\n", 174 | "endpage = int(re.split('
    ', re.split('
    ', r.text)[1])[0]) #листаем все новости дня\n", 175 | "for i in range(2,endpage+1):\n", 176 | " newlink = link+'?page=' + str(i)\n", 177 | "\n", 178 | "refs=re.split('', re.split('