├── images
│   ├── home.JPG
│   ├── nextpage.JPG
│   ├── result.JPG
│   └── README.md
├── README.md
├── LICENSE
└── moneycontrol_scrapper.py

--------------------------------------------------------------------------------
/images/home.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pratik-choudhari/Financial-news-scraper/HEAD/images/home.JPG

--------------------------------------------------------------------------------
/images/nextpage.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pratik-choudhari/Financial-news-scraper/HEAD/images/nextpage.JPG

--------------------------------------------------------------------------------
/images/result.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pratik-choudhari/Financial-news-scraper/HEAD/images/result.JPG

--------------------------------------------------------------------------------
/images/README.md:
--------------------------------------------------------------------------------
home.JPG - the main news page

nextpage.JPG - links to the next pages

result.JPG - snapshot of the resulting JSON file

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Financial-news-scraper
A scraper built with Beautiful Soup 4 in Python, tailor-made for extracting news from moneycontrol.com. Open a pull request to contribute scrapers for other sites.

__The main page to start scraping from: https://www.moneycontrol.com/news/technical-call-221.html__
![](images/home.JPG)

__The program also scrapes news from subsequent pages by extracting the links behind these pagination buttons__
![](images/nextpage.JPG)

__The resulting JSON file includes the headline, date, and image link for each story, indexed by page number__
![](images/result.JPG)

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2020 Pratik Choudhari

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
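The README above says the resulting JSON is indexed by page number; as a minimal sketch of the shape the script below produces (the field names are taken from the code, the values are illustrative placeholders, not real scraped data):

```python
# Illustrative shape of the dumped JSON (values are placeholders).
# Keys are page indices as strings; each maps to a list of three single-key dicts.
example = {
    "0": [
        {"title": ["<headline taken from img alt text>", "..."]},
        {"date": ["<date text taken from span>", "..."]},
        {"img_src": ["<thumbnail URL>", "..."]},
    ]
}
```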
--------------------------------------------------------------------------------
/moneycontrol_scrapper.py:
--------------------------------------------------------------------------------
import re
import json
import datetime
from collections import defaultdict

import requests
from tqdm import tqdm
from bs4 import BeautifulSoup

# Scraped data: lists of dicts, keyed by page index.
submission = defaultdict(list)

# Main page to start scraping from.
src_url = 'https://www.moneycontrol.com/news/technical-call-221.html'


# Collect the pagination links on the main page and call scrape() on each.
def setup(url):
    src_page = requests.get(url).text
    src = BeautifulSoup(src_page, 'lxml')

    # Skip anchors whose href is a javascript void placeholder.
    anchors = src.find("div", attrs={"class": "pagenation"}).findAll(
        'a', {'href': re.compile('^((?!void).)*$')})
    nextlinks = [a.attrs['href'] for a in anchors]
    for idx, link in enumerate(tqdm(nextlinks)):
        scrape('https://www.moneycontrol.com' + link, idx)


# Scrape headlines, dates, and image links from one listing page.
def scrape(url, idx):
    src_page = requests.get(url).text
    src = BeautifulSoup(src_page, 'lxml')

    # "cagetory" is the (misspelled) element id actually used on the site.
    listing = src.find("ul", {"id": "cagetory"})
    spans = listing.findAll('span')
    img_tags = listing.findAll('img')

    # Each <img> has its alt text set to the news heading, so the image
    # link and the heading come from the same tag.
    imgs = [i.attrs['src'] for i in img_tags]
    titles = [i.attrs['alt'] for i in img_tags]
    dates = [s.get_text() for s in spans]

    # List of dicts as values, indexed by page number.
    submission[str(idx)].append({'title': titles})
    submission[str(idx)].append({'date': dates})
    submission[str(idx)].append({'img_src': imgs})


# Save the data as a JSON file named by the current date.
def json_dump(data):
    date = datetime.date.today().strftime("%B %d, %Y")
    with open('moneycontrol_' + date + '.json', 'w') as outfile:
        json.dump(data, outfile)  # dump the passed-in data, not the global


if __name__ == '__main__':
    setup(src_url)
    json_dump(submission)

--------------------------------------------------------------------------------
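A minimal sketch of running the scraper and loading the dump back for inspection. The dependencies follow from the imports above (requests, beautifulsoup4, lxml, tqdm); the filename follows the json_dump() pattern, but the date in it is a placeholder for the actual run date:

```python
# Run the scraper first:  python moneycontrol_scrapper.py
import json

# json_dump() names the file 'moneycontrol_<Month DD, YYYY>.json';
# the date below is illustrative, not a real output file.
with open('moneycontrol_January 01, 2021.json') as f:
    data = json.load(f)

# Each page index maps to [{'title': [...]}, {'date': [...]}, {'img_src': [...]}].
for page, records in sorted(data.items(), key=lambda kv: int(kv[0])):
    titles = records[0]['title']
    first = titles[0] if titles else 'n/a'
    print(f"page {page}: {len(titles)} headlines, first: {first}")
```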