├── .github └── workflows │ └── python-publish.yml ├── .gitignore ├── .travis.yml ├── GoogleNews └── __init__.py ├── LICENSE ├── README.md ├── setup.py └── test ├── __init__.py ├── test_search.py └── test_sort.py /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflows will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: GitHub Action 5 | 6 | on: 7 | release: 8 | types: [created] 9 | 10 | jobs: 11 | deploy: 12 | 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v2 17 | - name: Set up Python 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: '3.x' 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install setuptools wheel twine 25 | - name: Build and publish 26 | env: 27 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 28 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 29 | run: | 30 | python setup.py sdist bdist_wheel 31 | twine upload dist/* 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | GoogleNews.egg-info/ 3 | build/ 4 | dist/ 5 | .DS_Store 6 | /venv 7 | .idea 8 | *.html 9 | ._* 10 | .coverage 11 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.6" 4 | # command to install dependencies 5 | install: 6 | - pip install dateparser 7 | - pip install beautifulsoup4 8 | - pip install coverage 9 | - pip install coveralls 10 | # command to run tests 11 | script: 12 | - coverage run -m unittest discover 
def lexical_date_parser(date_to_check):
    """Split a raw Google date string into (display_text, parsed_datetime).

    The fragment after the last '..' separator is handed to dateparser; on
    success the parsed datetime is returned naive (tzinfo stripped).  When
    parsing fails the untouched input string is kept as the display text and
    the datetime slot is None.  Empty input short-circuits to ('', None).
    """
    if date_to_check == '':
        return ('', None)

    parsed_dt = None
    display = copy.copy(date_to_check)
    try:
        # NOTE: when '..' is absent rfind() yields -1, so '+2' makes this
        # slice drop the first character — a quirk preserved from the
        # original implementation (only matters when dateparser succeeds).
        display = display[display.rfind('..') + 2:]
        parsed_dt = dateparser.parse(display)
    except:
        display = None
        parsed_dt = None

    if parsed_dt is None:
        # Parsing failed: fall back to the caller's original string.
        display = date_to_check
    else:
        # Normalize to a naive datetime so values compare/sort cleanly.
        parsed_dt = parsed_dt.replace(tzinfo=None)

    # Trim at most one leading space (matches historical behavior).
    display = display[1:] if display[0] == ' ' else display
    return display, parsed_dt
def define_date(date):
    """Convert a Google News date string into a datetime.

    Handles three families of input:
      * relative dates: "2 hours ago", "3 days ago", "1 month ago", ...
      * "Yesterday" (in any casing, with or without extra text)
      * absolute dates: "2 Feb 2021", "02/02/2021", "6 Aug" (the year
        defaults to the current year when omitted)

    Returns float('nan') when the string cannot be interpreted, so callers
    can detect failure without catching exceptions.
    """
    months = {'Jan':1,'Feb':2,'Mar':3,'Apr':4,'May':5,'Jun':6,'Jul':7,'Aug':8,'Sep':9,'Sept':9,'Oct':10,'Nov':11,'Dec':12,
              '01':1, '02':2, '03':3, '04':4, '05':5, '06':6, '07':7, '08':8, '09':9, '10':10, '11':11, '12':12}
    try:
        lowered = date.lower()
        # BUG FIX: "Yesterday" used to be tested inside the "... ago" branch,
        # so it could never match (it contains no " ago").  Check it first.
        if 'yesterday' in lowered:
            return datetime.datetime.now() - datetime.timedelta(days=1)
        if ' ago' in lowered:
            # e.g. "2 hours ago" -> ['2', 'hours', 'ago'] -> quantity '2'
            q = int(date.split()[-3])
            # Fixed-size units use stdlib timedelta — arithmetic identical to
            # the previous relativedelta calls for these units.
            # BUG FIX: 'min' also matches singular "1 minute ago", which the
            # old 'minutes'/'mins' test silently dropped.
            if 'min' in lowered:
                return datetime.datetime.now() - datetime.timedelta(minutes=q)
            elif 'hour' in lowered:
                return datetime.datetime.now() - datetime.timedelta(hours=q)
            elif 'day' in lowered:
                return datetime.datetime.now() - datetime.timedelta(days=q)
            elif 'week' in lowered:
                return datetime.datetime.now() - datetime.timedelta(days=7 * q)
            elif 'month' in lowered:
                # Months vary in length; keep relativedelta's calendar math.
                return datetime.datetime.now() + relativedelta(months=-q)
            # Unknown relative unit: fall through and return None, as before.
        else:
            # Absolute date: "d Mon yyyy" or "mm/dd/yyyy"-style after the
            # slash substitution.
            date_list = date.replace('/', ' ').split(' ')
            if len(date_list) == 2:
                # Year omitted — assume the current year.
                date_list.append(datetime.datetime.now().year)
            elif len(date_list) == 3:
                if date_list[0] == '':
                    date_list[0] = '1'
            return datetime.datetime(day=int(date_list[0]), month=months[date_list[1]], year=int(date_list[2]))
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit still
        # propagate; any parse failure is reported as NaN per the contract.
        return float('nan')
when upgrade""" 115 | self.set_time_range(start, end) 116 | 117 | def set_encode(self, encode): 118 | self.__encode = encode 119 | 120 | def set_topic(self, topic: str): 121 | self.__topic = topic 122 | 123 | def set_section(self, section: str): 124 | self.__section = section 125 | 126 | def setencode(self, encode): 127 | """Don't remove this, will affect old version user when upgrade""" 128 | self.set_encode(encode) 129 | 130 | def search(self, key): 131 | """ 132 | Searches for a term in google.com in the news section and retrieves the first page into __results. 133 | Parameters: 134 | key = the search term 135 | """ 136 | self.__key = key 137 | if self.__encode != "": 138 | self.__key = urllib.request.quote(self.__key.encode(self.__encode)) 139 | self.get_page() 140 | 141 | def build_response(self): 142 | self.req = urllib.request.Request(self.url.replace("search?","search?hl="+self.__lang+"&gl="+self.__lang+"&"), headers=self.headers) 143 | self.response = urllib.request.urlopen(self.req) 144 | self.page = self.response.read() 145 | self.content = Soup(self.page, "html.parser") 146 | stats = self.content.find_all("div", id="result-stats") 147 | if stats and isinstance(stats, ResultSet): 148 | stats = re.search(r'[\d,]+', stats[0].text) 149 | self.__totalcount = int(stats.group().replace(',', '')) 150 | else: 151 | #TODO might want to add output for user to know no data was found 152 | self.__totalcount = None 153 | logging.debug('Total count is not available when sort by date') 154 | result = self.content.find_all("a",attrs={'data-ved': True}) 155 | return result 156 | 157 | def remove_after_last_fullstop(self, s): 158 | # Find the last occurrence of the full stop 159 | last_period_index = s.rfind('.') 160 | # Slice the string up to the last full stop 161 | return s[:last_period_index+1] if last_period_index != -1 else s 162 | 163 | def page_at(self, page=1): 164 | """ 165 | Retrieves a specific page from google.com in the news sections into __results. 
    def page_at(self, page=1):
        """
        Retrieves a specific page of google.com news-tab results and RETURNS
        the parsed articles as a list of dicts (unlike get_page(), this does
        NOT store them into __results; it does still append to __texts and
        __links).
        Parameter:
        page = number of the page to be retrieved (1-based; mapped to the
               'start' query parameter as 10 * (page - 1))
        """
        results = []
        try:
            # Build the search URL.  Precedence: explicit date range, then
            # relative period (qdr:), then unrestricted.  sbd:1 sorts by date.
            if self.__start != "" and self.__end != "":
                self.url = "https://www.google.com/search?q={}&lr=lang_{}&biw=1920&bih=976&source=lnt&&tbs=lr:lang_1{},cdr:1,cd_min:{},cd_max:{},sbd:1&tbm=nws&start={}".format(self.__key,self.__lang,self.__lang,self.__start,self.__end,(10 * (page - 1)))
            elif self.__period != "":
                self.url = "https://www.google.com/search?q={}&lr=lang_{}&biw=1920&bih=976&source=lnt&&tbs=lr:lang_1{},qdr:{},,sbd:1&tbm=nws&start={}".format(self.__key,self.__lang,self.__lang,self.__period,(10 * (page - 1)))
            else:
                self.url = "https://www.google.com/search?q={}&lr=lang_{}&biw=1920&bih=976&source=lnt&&tbs=lr:lang_1{},sbd:1&tbm=nws&start={}".format(self.__key,self.__lang,self.__lang,(10 * (page - 1)))
        except AttributeError:
            # self.__key only exists after search() has run.
            raise AttributeError("You need to run a search() before using get_page().")
        try:
            result = self.build_response()
            for item in result:
                # Each field is scraped independently; any miss degrades to an
                # empty value instead of dropping the whole article.
                # NOTE(review): the chained find()/find_next_sibling() paths
                # below mirror Google's result markup at the time of writing
                # and will silently yield '' if the markup changes.
                try:
                    tmp_text = item.find("h3").text.replace("\n","")
                except Exception:
                    tmp_text = ''
                try:
                    # Strip Google's redirect prefix to get the target URL.
                    tmp_link = item.get("href").replace('/url?esrc=s&q=&rct=j&sa=U&url=','')
                except Exception:
                    tmp_link = ''
                try:
                    tmp_media = item.find('div').find('div').find('div').find_next_sibling('div').text
                except Exception:
                    tmp_media = ''
                try:
                    tmp_date = item.find('div').find_next_sibling('div').find('span').text
                    # tmp_datetime is computed but not used below; the result
                    # dict uses define_date(tmp_date) instead.
                    tmp_date,tmp_datetime=lexical_date_parser(tmp_date)
                except Exception:
                    tmp_date = ''
                    tmp_datetime=None
                try:
                    tmp_desc = self.remove_after_last_fullstop(item.find('div').find_next_sibling('div').find('div').find_next_sibling('div').find('div').find('div').find('div').text).replace('\n','')
                except Exception:
                    tmp_desc = ''
                try:
                    tmp_img = item.find("img").get("src")
                except Exception:
                    tmp_img = ''
                self.__texts.append(tmp_text)
                self.__links.append(tmp_link)
                results.append({'title': tmp_text, 'media': tmp_media,'date': tmp_date,'datetime':define_date(tmp_date),'desc': tmp_desc, 'link': tmp_link,'img': tmp_img})
            self.response.close()
        except Exception as e_parser:
            # Parsing/network failure: report it, and only propagate when the
            # user opted in via enableException().
            print(e_parser)
            if self.__exception:
                raise Exception(e_parser)
            else:
                pass
        return results
self.__texts.append(tmp_text) 209 | self.__links.append(tmp_link) 210 | results.append({'title': tmp_text, 'media': tmp_media,'date': tmp_date,'datetime':define_date(tmp_date),'desc': tmp_desc, 'link': tmp_link,'img': tmp_img}) 211 | self.response.close() 212 | except Exception as e_parser: 213 | print(e_parser) 214 | if self.__exception: 215 | raise Exception(e_parser) 216 | else: 217 | pass 218 | return results 219 | 220 | def get_page(self, page=1): 221 | """ 222 | Retrieves a specific page from google.com in the news sections into __results. 223 | Parameter: 224 | page = number of the page to be retrieved 225 | """ 226 | try: 227 | if self.__start != "" and self.__end != "": 228 | self.url = "https://www.google.com/search?q={}&lr=lang_{}&biw=1920&bih=976&source=lnt&&tbs=lr:lang_1{},cdr:1,cd_min:{},cd_max:{},sbd:1&tbm=nws&start={}".format(self.__key,self.__lang,self.__lang,self.__start,self.__end,(10 * (page - 1))) 229 | elif self.__period != "": 230 | self.url = "https://www.google.com/search?q={}&lr=lang_{}&biw=1920&bih=976&source=lnt&&tbs=lr:lang_1{},qdr:{},,sbd:1&tbm=nws&start={}".format(self.__key,self.__lang,self.__lang,self.__period,(10 * (page - 1))) 231 | else: 232 | self.url = "https://www.google.com/search?q={}&lr=lang_{}&biw=1920&bih=976&source=lnt&&tbs=lr:lang_1{},sbd:1&tbm=nws&start={}".format(self.__key,self.__lang,self.__lang,(10 * (page - 1))) 233 | except AttributeError: 234 | raise AttributeError("You need to run a search() before using get_page().") 235 | try: 236 | result = self.build_response() 237 | for item in result: 238 | try: 239 | tmp_text = item.find("h3").text.replace("\n","") 240 | except Exception: 241 | tmp_text = '' 242 | try: 243 | tmp_link = item.get("href").replace('/url?esrc=s&q=&rct=j&sa=U&url=','') 244 | except Exception: 245 | tmp_link = '' 246 | try: 247 | tmp_media = item.find('div').find('div').find('div').find_next_sibling('div').text 248 | except Exception: 249 | tmp_media = '' 250 | try: 251 | tmp_date = 
item.find('div').find_next_sibling('div').find('span').text 252 | tmp_date,tmp_datetime=lexical_date_parser(tmp_date) 253 | except Exception: 254 | tmp_date = '' 255 | tmp_datetime=None 256 | try: 257 | tmp_desc = self.remove_after_last_fullstop(item.find('div').find_next_sibling('div').find('div').find_next_sibling('div').find('div').find('div').find('div').text).replace('\n','') 258 | except Exception: 259 | tmp_desc = '' 260 | try: 261 | tmp_img = item.find("img").get("src") 262 | except Exception: 263 | tmp_img = '' 264 | self.__texts.append(tmp_text) 265 | self.__links.append(tmp_link) 266 | self.__results.append({'title': tmp_text, 'media': tmp_media,'date': tmp_date,'datetime':define_date(tmp_date),'desc': tmp_desc, 'link': tmp_link,'img': tmp_img}) 267 | self.response.close() 268 | except Exception as e_parser: 269 | print(e_parser) 270 | if self.__exception: 271 | raise Exception(e_parser) 272 | else: 273 | pass 274 | 275 | def getpage(self, page=1): 276 | """Don't remove this, will affect old version user when upgrade""" 277 | self.get_page(page) 278 | 279 | def get_news(self, key="",deamplify=False): 280 | if key != '': 281 | if self.__period != "": 282 | key += f" when:{self.__period}" 283 | else: 284 | if self.__period != "": 285 | key += f"when:{self.__period}" 286 | key = urllib.request.quote(key.encode(self.__encode)) 287 | start = f'{self.__start[-4:]}-{self.__start[:2]}-{self.__start[3:5]}' 288 | end = f'{self.__end[-4:]}-{self.__end[:2]}-{self.__end[3:5]}' 289 | 290 | if self.__start == '' or self.__end == '': 291 | self.url = 'https://news.google.com/search?q={}&hl={}'.format( 292 | key, self.__lang.lower()) 293 | else: 294 | self.url = 'https://news.google.com/search?q={}+before:{}+after:{}&hl={}'.format( 295 | key, end, start, self.__lang.lower()) 296 | 297 | if self.__topic: 298 | self.url = 'https://news.google.com/topics/{}'.format( 299 | self.__topic) 300 | 301 | if self.__section: 302 | self.url = 
'https://news.google.com/topics/{}/sections/{}'.format( 303 | self.__topic, self.__section) 304 | 305 | 306 | try: 307 | self.req = urllib.request.Request(self.url, headers=self.headers) 308 | self.response = urllib.request.urlopen(self.req) 309 | self.page = self.response.read() 310 | self.content = Soup(self.page, "html.parser") 311 | articles = self.content.select('article') 312 | for article in articles: 313 | try: 314 | # title 315 | try: 316 | title=article.findAll('div')[2].findAll('a')[0].text 317 | except: 318 | try: 319 | title=article.findAll('a')[1].text 320 | except: 321 | title=None 322 | # description 323 | try: 324 | desc=None 325 | except: 326 | desc=None 327 | # date 328 | try: 329 | date = article.find("time").text 330 | # date,datetime_tmp = lexial_date_parser(date) 331 | except: 332 | date = None 333 | # datetime 334 | try: 335 | datetime_chars=article.find('time').get('datetime') 336 | datetime_obj = parse(datetime_chars).replace(tzinfo=None) 337 | except: 338 | datetime_obj=None 339 | # link 340 | if deamplify: 341 | try: 342 | link = 'https://news.google.com/' + article.find('div').find("a").get("href")[2:] 343 | except Exception as deamp_e: 344 | print(deamp_e) 345 | link = article.find("article").get("jslog").split('2:')[1].split(';')[0] 346 | else: 347 | try: 348 | link = 'https://news.google.com/' + article.find('div').find("a").get("href")[2:] 349 | except Exception as deamp_e: 350 | print(deamp_e) 351 | link = None 352 | self.__texts.append(title) 353 | self.__links.append(link) 354 | if link.startswith('https://www.youtube.com/watch?v='): 355 | desc = 'video' 356 | # image 357 | try: 358 | img = 'https://news.google.com'+article.find("figure").find("img").get("src") 359 | except: 360 | img = None 361 | # site 362 | try: 363 | site=article.find("time").parent.find("a").text 364 | except: 365 | site=None 366 | try: 367 | media=article.find("div").findAll("div")[1].find("div").find("div").find("div").text 368 | except: 369 | try: 370 | 
media=article.findAll("div")[1].find("div").find("div").find("div").text 371 | except: 372 | media=None 373 | # reporter 374 | try: 375 | reporter = article.findAll('span')[2].text 376 | except: 377 | reporter = None 378 | # collection 379 | self.__results.append({'title':title, 380 | 'desc':desc, 381 | 'date':date, 382 | 'datetime':define_date(date), 383 | 'link':link, 384 | 'img':img, 385 | 'media':media, 386 | 'site':site, 387 | 'reporter':reporter}) 388 | except Exception as e_article: 389 | print(e_article) 390 | self.response.close() 391 | except Exception as e_parser: 392 | print(e_parser) 393 | if self.__exception: 394 | raise Exception(e_parser) 395 | else: 396 | pass 397 | 398 | def total_count(self): 399 | return self.__totalcount 400 | 401 | def result(self,sort=False): 402 | """Don't remove this, will affect old version user when upgrade""" 403 | return self.results(sort) 404 | 405 | def results(self,sort=False): 406 | """Returns the __results. 407 | New feature: include datatime and sort the articles in decreasing order""" 408 | results=self.__results 409 | if sort: 410 | try: 411 | results.sort(key = lambda x:x['datetime'],reverse=True) 412 | except Exception as e_sort: 413 | print(e_sort) 414 | if self.__exception: 415 | raise Exception(e_sort) 416 | else: 417 | pass 418 | results=self.__results 419 | return results 420 | 421 | def get_texts(self): 422 | """Returns only the __texts of the __results.""" 423 | return self.__texts 424 | 425 | def gettext(self): 426 | """Don't remove this, will affect old version user when upgrade""" 427 | return self.get_texts() 428 | 429 | def get_links(self): 430 | """Returns only the __links of the __results.""" 431 | return self.__links 432 | 433 | def clear(self): 434 | self.__texts = [] 435 | self.__links = [] 436 | self.__results = [] 437 | self.__totalcount = 0 438 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | Copyright (c) 2018 The Python Packaging Authority 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 
20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GoogleNews 2 | 3 | [![Build Status](https://app.travis-ci.com/Iceloof/GoogleNews.svg)](https://app.travis-ci.com/github/Iceloof/GoogleNews) 4 | [![Coverage Status](https://coveralls.io/repos/github/Iceloof/GoogleNews/badge.svg)](https://coveralls.io/github/Iceloof/GoogleNews) 5 | [![PyPI](https://img.shields.io/pypi/v/GoogleNews)](https://pypi.org/project/GoogleNews/) 6 | [![PyPI - Downloads](https://img.shields.io/pypi/dm/GoogleNews)](https://pypistats.org/packages/googlenews) 7 | ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/GoogleNews) 8 | ![PyPI - Wheel](https://img.shields.io/pypi/wheel/GoogleNews) 9 | ![GitHub contributors](https://img.shields.io/github/contributors/Iceloof/GoogleNews) 10 | ![GitHub issues](https://img.shields.io/github/issues-raw/Iceloof/GoogleNews) 11 | ![GitHub Action](https://github.com/Iceloof/GoogleNews/workflows/GitHub%20Action/badge.svg) 12 | ![GitHub](https://img.shields.io/github/license/Iceloof/GoogleNews) 13 | 14 | ## Install 15 | ``` 16 | pip install GoogleNews 17 | ``` 18 | or 19 | ``` 20 | pip install --upgrade GoogleNews 21 | ``` 22 | ## Usage 23 | - Initializing 24 | ``` 25 | from GoogleNews import GoogleNews 26 | googlenews = GoogleNews() 27 | ``` 28 | - Check version 29 | ``` 30 | print(googlenews.getVersion()) 31 | ``` 32 | - Enable to throw exception 33 | ``` 34 | googlenews.enableException(True) 35 | ``` 36 | - Optional choose language 37 | ``` 38 | googlenews = GoogleNews(lang='en') 39 | ``` 40 | or 41 | ``` 42 | googlenews = GoogleNews(lang='en', region='US') 43 | ``` 44 | - Optional choose period (period and custom day range should not set together) 45 | ``` 46 | googlenews = GoogleNews(period='7d') 47 | ``` 48 | - Optional choose custom day range (mm/dd/yyyy) 49 | ``` 50 | googlenews = 
GoogleNews(start='02/01/2020',end='02/28/2020') 51 | ``` 52 | - Optional set encode 53 | ``` 54 | googlenews = GoogleNews(encode='utf-8') 55 | ``` 56 | or 57 | ``` 58 | googlenews.set_lang('en') 59 | googlenews.set_period('7d') 60 | googlenews.set_time_range('02/01/2020','02/28/2020') 61 | googlenews.set_encode('utf-8') 62 | ``` 63 | - **news.google.com** search sample 64 | ``` 65 | googlenews.get_news('APPLE') 66 | ``` 67 | - **news.google.com get news by topics 68 | ``` 69 | # Sports 70 | googlenews.set_topic('CAAqKggKIiRDQkFTRlFvSUwyMHZNRFp1ZEdvU0JYQjBMVUpTR2dKQ1VpZ0FQAQ') 71 | googlenews.get_news() 72 | ``` 73 | - **news.google.com get news by topic and sections 74 | ``` 75 | # Sports 76 | googlenews.set_topic('CAAqKggKIiRDQkFTRlFvSUwyMHZNRFp1ZEdvU0JYQjBMVUpTR2dKQ1VpZ0FQAQ') 77 | # Football Soccer 78 | googlenews.set_section('CAQiS0NCQVNNZ29JTDIwdk1EWnVkR29TQlhCMExVSlNHZ0pDVWlJT0NBUWFDZ29JTDIwdk1ESjJlRFFxQ3dvSkVnZEdkWFJsWW05c0tBQSouCAAqKggKIiRDQkFTRlFvSUwyMHZNRFp1ZEdvU0JYQjBMVUpTR2dKQ1VpZ0FQAVAB') 79 | 80 | googlenews.get_news() 81 | ``` 82 | - **google.com** section news search sample 83 | ``` 84 | googlenews.search('APPLE') 85 | ``` 86 | 87 | Default return first page result, you don't need to get first page again, otherwise you might get duplicate result. 
To get other page of search results: 88 | 89 | ``` 90 | googlenews.get_page(2) 91 | ``` 92 | - If you only want to get specific page 93 | ``` 94 | result = googlenews.page_at(2) 95 | ``` 96 | - If you want to get the total result number of the search(this is approximate number, not exact number, it is the number showing on the google search page) (Note: this function is not available for `googlenews.search()`) 97 | ``` 98 | googlenews.total_count() 99 | ``` 100 | - Get results will return the list, `[{'title': '...', 'media': '...', 'date': '...', 'datetime': '...', 'desc': '...', 'link': '...', 'img': '...'}]` 101 | ``` 102 | googlenews.results() 103 | ``` 104 | if `googlenews.results(sort=True)` the tool will try to order the results in cronologically reversed order 105 | 106 | - Get texts will return the list of news titles 107 | ``` 108 | googlenews.get_texts() 109 | ``` 110 | - Get links returns the list of news links 111 | ``` 112 | googlenews.get_links() 113 | ``` 114 | - Clear result list before doing another search with the same object 115 | ``` 116 | googlenews.clear() 117 | ``` 118 | ## Issue 119 | Image is not working in the latest version, it can only return default google loading gif 120 | 121 | The date range is not always working as Google may return the result with random order or out of date range. 122 | 123 | Google may recognize the program as automated robots and block the IP, using cloud server and fetching data with high frequency will get higher chance to be blocked. 
import setuptools

# Read the long description from the README so PyPI renders the project docs.
# BUG FIX: encoding="utf-8" prevents a UnicodeDecodeError on platforms whose
# default locale encoding is not UTF-8 (e.g. Windows cp1252), since README.md
# contains non-ASCII characters.
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

setuptools.setup(
    name="GoogleNews",
    version="1.6.15",
    author="Hurin Hu",
    author_email="hurin@live.ca",
    description="Google News search for Python",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/Iceloof/GoogleNews",
    packages=setuptools.find_packages(),
    install_requires=['beautifulsoup4', 'dateparser', 'python-dateutil'],
    classifiers=[
        "Programming Language :: Python :: 3.6",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
)
0) 26 | print('Result length with two pages is correct') 27 | 28 | def testEncode(self): 29 | googlenews = GoogleNews(lang='ru',encode='utf-8') 30 | googlenews.search("Моцарт") 31 | length = len(googlenews.result()) 32 | self.assertNotEqual(length, 0) 33 | print('Encoding result is not empty') 34 | 35 | # def testTotalCountGreaterThanZero(self): 36 | # googlenews = GoogleNews() 37 | # googlenews.search(keyword) 38 | # count = googlenews.total_count() 39 | # self.assertGreater(count, 0) 40 | # print('Total count is greater than zero') 41 | 42 | def testResultNumberAtTwoPages(self): 43 | googlenews = GoogleNews() 44 | googlenews.search(keyword) 45 | result = googlenews.page_at(2) 46 | length = len(result) 47 | self.assertNotEqual(length, 0) 48 | print('Result length at two pages is correct') 49 | 50 | class TestStringMethods(unittest.TestCase): 51 | 52 | def testVersion(self): 53 | googlenews = GoogleNews() 54 | version = '1.6.15' 55 | self.assertIn(version, googlenews.getVersion()) 56 | print('Latest version matched') 57 | 58 | def testResultContainsKeyword(self): 59 | googlenews = GoogleNews() 60 | googlenews.search(keyword) 61 | result = googlenews.result()[0] 62 | print(result.get('title').lower()+result.get('desc').lower()) 63 | self.assertIn(keyword.lower(), result.get('title').lower()+result.get('desc').lower()) 64 | print('Result contains keyword') 65 | 66 | def testResultHasLink(self): 67 | googlenews = GoogleNews() 68 | googlenews.search(keyword) 69 | result = googlenews.result()[0] 70 | print(result.get('link').lower()) 71 | self.assertIn('http', result.get('link').lower()) 72 | print('Result contains http link') 73 | 74 | def testResultHasImage(self): 75 | googlenews = GoogleNews() 76 | googlenews.search(keyword) 77 | result = googlenews.result()[0] 78 | print(result.get('img').lower()) 79 | self.assertIn('base64', result.get('img').lower()) 80 | print('Result contains image') 81 | 82 | def testResultHasTitle(self): 83 | googlenews = GoogleNews() 84 | 
### MODULES

from GoogleNews import GoogleNews

### METHODS

def show_routine(results):
    # Print "<index>. <date> - <title>" for every article dict returned by
    # GoogleNews.results(), so the ordering of the two feeds can be compared
    # by eye.
    for num,page in enumerate(results):
        print(f"{num}. {page['date']} - {page['title']}")

### MAIN

# Set up the search: same keyword and period, but two independent client
# objects so the news.google.com and google.com result lists don't share
# internal state.
keywords="covid cava de' tirreni"
period='10d'
google_news = GoogleNews(lang='it',period=period)
google=GoogleNews(lang='it',period=period)

# Results from news.google.com, sorted newest-first via results(sort=True)
google_news.get_news(keywords)
results_gnews=google_news.results(sort=True)
show_routine(results_gnews)

# Results from google.com (news tab), sorted the same way
google.search(keywords)
results_google=google.results(sort=True)
show_routine(results_google)