├── README.md
└── main.py

/README.md:
--------------------------------------------------------------------------------

# Python Scrapper Cosmos Mongo DB

This project helps you scrape text from a website and send it directly to an Azure Cosmos DB for MongoDB database.

## Installation

Install the dependencies:

```bash
pip install requests
pip install beautifulsoup4
pip install pymongo
```

## Author

- [@AntoineSmet](https://github.com/AntoineSmet)

![Logo](https://cloudblogs.microsoft.com/industry-blog/uploads/industry/sites/22/2019/06/cosmosdbheader.jpg)

--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import requests
from bs4 import BeautifulSoup as bs
import pymongo

# Replace with your own Cosmos DB for MongoDB connection string
myclient = pymongo.MongoClient("mongodb://******************************")
# Your database name
mydb = myclient["MyDataBase"]
# Your collection name
mycol = mydb["MyCollection"]

# Change the URL to the site you want to scrape
URL = 'WebSiteURL'
number = 0
# First page to scrape
page_begin = 1
# Last page to scrape (the range's upper bound is exclusive, hence the + 1)
page_end = 230 + 1
# Open the file where you want to save the data
file_txt = open("data.txt", "w", encoding="UTF-8")

# To scrape only one page, set page_end to page_begin + 1 or remove the loop
for page in range(page_begin, page_end):
    # Check the URL pattern of the pages you want to scrape and adjust it
    # if it contains more than just the page number
    req = requests.get(URL + str(page))
    # Filter the data div by div, like a funnel
    soup = bs(req.text, 'html.parser')
    # Find the parent div; replace 'card' with the class of the div you want to scrape
    mother = soup.find('div', attrs={'class': 'card'})
    if mother is None:
        # Skip pages where the expected container is missing
        continue
    sons = mother.find_all('div', attrs={'class': 'p-2'})
    for son in sons:
        number = number + 1
        # Replace 'text-dark' with the class of the element you want to extract
        elements = son.find_all('a', attrs={'class': 'text-dark'})
        # Extract the text from the element
        title = elements[0].text
        # Replace characters that you don't want in the text
        title = title.replace('"', "'")
        # Build the document to insert (safer than assembling a JSON string by hand)
        document = {
            "id": number,
            "title": title
        }
        # Insert the document into the database
        mycol.insert_one(document)
        # Write the data to the file in whatever format you want
        data_txt = "%s - %s\n" % (number, title)
        file_txt.write(data_txt)
        # Print the result in the terminal to check that it is working
        print(data_txt)
# Close the output file
file_txt.close()
--------------------------------------------------------------------------------
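
After the script finishes, it can be handy to confirm that the documents actually landed in Cosmos DB. Below is a minimal sketch of such a check, assuming the same connection string, database name ("MyDataBase"), and collection name ("MyCollection") used in main.py; it simply counts the stored documents and prints the first few.

```python
import pymongo

# Same placeholder connection string as in main.py; replace it with your own
myclient = pymongo.MongoClient("mongodb://******************************")
mydb = myclient["MyDataBase"]
mycol = mydb["MyCollection"]

# Count the documents in the collection and print a few of them as a spot check
print("Documents in collection:", mycol.count_documents({}))
for doc in mycol.find().limit(5):
    print(doc["id"], "-", doc["title"])
```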