# Repository layout:
#   ├── README.md
#   └── getquestions.py
#
# README.md
# ---------
# # stackoverflow
#
# Using stackoverflow as an example, this code allows us to create one
# function that can scrape multiple pages of the same website.
# Code from YouTube demo.
#
# getquestions.py
# ---------------
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Browser-like User-Agent sent with every request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'}

# Accumulates one dict per scraped question across all getQuestions() calls.
questionlist = []


def getQuestions(tag, page, timeout=30):
    """Scrape one listing page of questions for *tag* into ``questionlist``.

    Requests the "Active" tab of the tagged-questions listing (50 results
    per page) and appends one dict per ``div.question-summary`` element to
    the module-level ``questionlist``. Each dict has the keys ``tag``,
    ``title``, ``link``, ``votes`` (int) and ``date``.

    Args:
        tag: Tag name interpolated into the listing URL.
        page: Page number interpolated into the listing URL.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled connection raises instead of hanging indefinitely.

    Returns:
        None. Results are appended to ``questionlist``.

    Raises:
        requests.exceptions.RequestException: If the HTTP request fails or
            times out.
        AttributeError, TypeError: If a question summary lacks one of the
            expected elements (``find`` returns ``None`` in that case).
        ValueError: If the vote count text is not an integer.
    """
    url = f'https://stackoverflow.com/questions/tagged/{tag}?tab=Active&page={page}&pagesize=50'
    r = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(r.text, 'html.parser')
    questions = soup.find_all('div', {'class': 'question-summary'})
    for item in questions:
        # Title and link come from the same anchor; look it up only once.
        hyperlink = item.find('a', {'class': 'question-hyperlink'})
        question = {
            'tag': tag,
            'title': hyperlink.text,
            # The href is site-relative, so prefix the host.
            'link': 'https://stackoverflow.com' + hyperlink['href'],
            'votes': int(item.find('span', {'class': 'vote-count-post'}).text),
            'date': item.find('span', {'class': 'relativetime'})['title'],
        }
        questionlist.append(question)


# Scrape pages 1 and 2 for each tag of interest.
for x in range(1, 3):
    getQuestions('python', x)
    getQuestions('flask', x)

# Export everything collected to a spreadsheet.
df = pd.DataFrame(questionlist)
df.to_excel('stackquestions.xlsx', index=False)
print('Fin.')