"""scraper.py: save nature.com articles of a chosen type as text files."""

import os
import string

import requests
from bs4 import BeautifulSoup

# Sent with every request so the site serves its English pages.
_REQUEST_HEADERS = {'Accept-Language': 'en-US,en;q=0.5'}
# Seconds to wait on the server; without a timeout requests can block forever.
_REQUEST_TIMEOUT = 30
# Translation table that deletes every ASCII punctuation character.
_STRIP_PUNCTUATION = str.maketrans('', '', string.punctuation)


def search(usr_url, temp_article):
    """Collect links to the articles of one type found on a listing page.

    Args:
        usr_url: URL of the listing page to download.
        temp_article: Article type to keep; compared for exact equality with
            the stripped text of each article's
            ``<span data-test="article.type">``.

    Returns:
        A list of ``https://nature.com``-prefixed links, in page order, one
        per matching ``<article>`` element (taken from its first ``<a>``).

    Raises:
        requests.RequestException: If the page cannot be downloaded, including
            when the server does not answer within the timeout.
    """
    response = requests.get(
        usr_url, headers=_REQUEST_HEADERS, timeout=_REQUEST_TIMEOUT
    )
    soup = BeautifulSoup(response.content, 'html.parser')

    links = []
    for article in soup.find_all('article'):
        type_tag = article.find('span', {'data-test': 'article.type'})
        if type_tag is None or type_tag.text.strip() != temp_article:
            continue
        anchor = article.find('a')
        # Skip teasers without a usable link instead of crashing the crawl.
        if anchor is None or not anchor.get('href'):
            continue
        links.append(f"https://nature.com{anchor['href']}")
    return links


def _title_to_filename(title):
    """Turn a page title into a ``.txt`` file name.

    ASCII punctuation is removed and every space becomes an underscore.
    """
    cleaned = title.translate(_STRIP_PUNCTUATION).replace(' ', '_')
    return f"{cleaned}.txt"


def fetch_data_url(url_):
    """Download one article and save its body text in the current directory.

    The file is named after the page ``<title>`` (see ``_title_to_filename``)
    and contains the stripped text of the last ``<div>`` whose class list has
    an entry containing ``'body'``. The file is empty if no such div exists.

    Args:
        url_: URL of the article page.

    Returns:
        The name of the written file on HTTP 200; otherwise the message
        ``'The URL returned <status>!'`` and nothing is written.

    Raises:
        requests.RequestException: If the page cannot be downloaded, including
            when the server does not answer within the timeout.
        OSError: If the output file cannot be written.
    """
    page = requests.get(
        url_, headers=_REQUEST_HEADERS, timeout=_REQUEST_TIMEOUT
    )
    # Check the status before parsing: an error page may lack the elements
    # read below, and nothing should be written for it anyway.
    if page.status_code != 200:
        return f'The URL returned {page.status_code}!'

    soup = BeautifulSoup(page.content, 'html.parser')

    title_tag = soup.find('title')
    # Fall back to a fixed name rather than failing on a page without <title>.
    title = title_tag.text if title_tag is not None else 'untitled'
    new_title = _title_to_filename(title)

    # The last matching div wins; remember it and extract its text only once.
    body_div = None
    for div in soup.find_all('div'):
        css_classes = div.get('class') or []
        if any('body' in css_class for css_class in css_classes):
            body_div = div
    content = body_div.text.strip() if body_div is not None else ''

    with open(new_title, 'wb') as file:
        file.write(content.encode('utf-8'))
    return new_title


def main():
    """Save the requested article type from the first N listing pages of 2020.

    Reads two lines from standard input: the number of listing pages to
    visit, then the article type to keep. For each page ``i`` a directory
    ``Page_<i>`` is created in the starting directory and every matching
    article is saved inside it. A summary of the collected results is printed
    at the end.
    """
    url_template = 'https://www.nature.com/nature/articles?sort=PubDate&year=2020&page='
    pages = int(input())
    article_ = input()
    file_list = []
    start_dir = os.getcwd()
    for i in range(1, pages + 1):
        page_dir = f'Page_{i}'
        # exist_ok lets the script be re-run without FileExistsError.
        os.makedirs(page_dir, exist_ok=True)
        # fetch_data_url writes into the current directory, so step into the
        # page directory while that page's articles are being saved.
        os.chdir(page_dir)
        try:
            url = f'{url_template}{i}'
            print(url)
            print(article_)
            for link in search(url, article_):
                # NOTE: on a non-200 response fetch_data_url returns a status
                # message instead of a file name; it is listed as-is.
                file_list.append(fetch_data_url(link))
        finally:
            # Always return to the starting directory, even if a download or
            # parse fails, so later pages are not nested inside this one.
            os.chdir(start_dir)
    print(f'Saved all articles: {file_list}')


if __name__ == '__main__':
    main()