├── README.md
└── main.py

/README.md:
--------------------------------------------------------------------------------

# Python Scrapper Cosmos Mongo DB

This project helps you scrape text from a website and send it directly to an Azure Cosmos DB for MongoDB database.

## Installation

Install the dependencies:

```bash
pip install requests
pip install beautifulsoup4
pip install pymongo
```

## Author

- [@AntoineSmet](https://github.com/AntoineSmet)

![Logo](https://cloudblogs.microsoft.com/industry-blog/uploads/industry/sites/22/2019/06/cosmosdbheader.jpg)

--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import requests
from bs4 import BeautifulSoup as bs
import pymongo

# Replace with your own Cosmos DB for MongoDB connection string
myclient = pymongo.MongoClient("mongodb://******************************")
# Your database name
mydb = myclient["MyDataBase"]
# Your collection name
mycol = mydb["MyCollection"]

# Change the URL to the site you want to scrape
URL = 'WebSiteURL'
number = 0
# First page to scrape
page_begin = 1
# Last page to scrape (the range's upper bound is exclusive, hence the + 1)
page_end = 230 + 1
# Open the file where you want to save the data
file_txt = open("data.txt", "w", encoding="UTF-8")

# To scrape only one page, set page_end to page_begin + 1 or remove the loop
for page in range(page_begin, page_end):
    # Check the URL pattern of the pages you want to scrape and adjust it
    # if it contains more than just the page number
    req = requests.get(URL + str(page))
    # Filter the data div by div, like a funnel
    soup = bs(req.text, 'html.parser')
    # Find the parent div; replace 'card' with the class of the div you want to scrape
    mother = soup.find('div', attrs={'class': 'card'})
    if mother is None:
        # Skip pages where the expected container is missing
        continue
    sons = mother.find_all('div', attrs={'class': 'p-2'})
    for son in sons:
        number = number + 1
        # Replace 'text-dark' with the class of the element you want to extract
        elements = son.find_all('a', attrs={'class': 'text-dark'})
        # Extract the text from the element
        title = elements[0].text
        # Replace characters that you don't want in the text
        title = title.replace('"', "'")
        # Build the document to insert (safer than assembling a JSON string by hand)
        document = {
            "id": number,
            "title": title
        }
        # Insert the document into the database
        mycol.insert_one(document)
        # Write the data to the file in whatever format you want
        data_txt = "%s - %s\n" % (number, title)
        file_txt.write(data_txt)
        # Print the result in the terminal to check that it is working
        print(data_txt)
# Close the output file
file_txt.close()
--------------------------------------------------------------------------------
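
After the script finishes, it can be handy to confirm that the documents actually landed in Cosmos DB. Below is a minimal sketch of such a check, assuming the same connection string, database name ("MyDataBase"), and collection name ("MyCollection") used in main.py; it simply counts the stored documents and prints the first few.

```python
import pymongo

# Same placeholder connection string as in main.py; replace it with your own
myclient = pymongo.MongoClient("mongodb://******************************")
mydb = myclient["MyDataBase"]
mycol = mydb["MyCollection"]

# Count the documents in the collection and print a few of them as a spot check
print("Documents in collection:", mycol.count_documents({}))
for doc in mycol.find().limit(5):
    print(doc["id"], "-", doc["title"])
```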