├── README.md
└── getquestions.py


/README.md:
--------------------------------------------------------------------------------
1 | # stackoverflow
2 | 
3 | Using Stack Overflow as an example, this code shows how to write one function that can scrape multiple pages of the same website. The code comes from a YouTube demo.
4 | 


--------------------------------------------------------------------------------
/getquestions.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | from bs4 import BeautifulSoup
 3 | import pandas as pd
 4 | 
 5 | headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'}
 6 | 
 7 | questionlist = []
 8 | 
 9 | def getQuestions(tag, page):
10 |     url = f'https://stackoverflow.com/questions/tagged/{tag}?tab=Active&page={page}&pagesize=50'
11 |     r = requests.get(url, headers=headers)
12 |     soup = BeautifulSoup(r.text, 'html.parser')
13 |     questions = soup.find_all('div', {'class': 'question-summary'})
14 |     for item in questions:
15 |         question = {
16 |         'tag': tag,    
17 |         'title': item.find('a', {'class': 'question-hyperlink'}).text,
18 |         'link': 'https://stackoverflow.com' + item.find('a', {'class': 'question-hyperlink'})['href'],
19 |         'votes': int(item.find('span', {'class': 'vote-count-post'}).text),
20 |         'date': item.find('span', {'class': 'relativetime'})['title'],
21 |         }
22 |         questionlist.append(question)
23 |     return
24 | 
# Scrape result pages 1 and 2; on each page fetch the 'python' tag first,
# then the 'flask' tag.
for page in range(1, 3):
    for tag in ('python', 'flask'):
        getQuestions(tag, page)

# Dump every collected question to a spreadsheet, one row per question,
# without the DataFrame index column.
pd.DataFrame(questionlist).to_excel('stackquestions.xlsx', index=False)
print('Fin.')
32 | 


--------------------------------------------------------------------------------