├── .github └── workflows │ └── python-publish.yml ├── .gitignore ├── .travis.yml ├── GoogleNews └── __init__.py ├── LICENSE ├── README.md ├── setup.py └── test ├── __init__.py ├── test_search.py └── test_sort.py /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflows will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: GitHub Action 5 | 6 | on: 7 | release: 8 | types: [created] 9 | 10 | jobs: 11 | deploy: 12 | 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v2 17 | - name: Set up Python 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: '3.x' 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install setuptools wheel twine 25 | - name: Build and publish 26 | env: 27 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 28 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 29 | run: | 30 | python setup.py sdist bdist_wheel 31 | twine upload dist/* 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | GoogleNews.egg-info/ 3 | build/ 4 | dist/ 5 | .DS_Store 6 | /venv 7 | .idea 8 | *.html 9 | ._* 10 | .coverage 11 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.6" 4 | # command to install dependencies 5 | install: 6 | - pip install dateparser 7 | - pip install beautifulsoup4 8 | - pip install coverage 9 | - pip install coveralls 10 | # command to run tests 11 | script: 12 | - coverage run -m unittest discover 
def lexical_date_parser(date_to_check):
    """Split a raw Google date string into (display_text, parsed_datetime).

    The fragment after the last '..' separator is handed to dateparser; on
    success the parsed datetime is returned naive (tzinfo stripped).  When
    parsing fails the untouched input string is kept as the display text and
    the datetime slot is None.  Empty input short-circuits to ('', None).
    """
    if date_to_check == '':
        return ('', None)

    parsed_dt = None
    display = copy.copy(date_to_check)
    try:
        # NOTE: when '..' is absent rfind() yields -1, so '+2' makes this
        # slice drop the first character — a quirk preserved from the
        # original implementation (only matters when dateparser succeeds).
        display = display[display.rfind('..') + 2:]
        parsed_dt = dateparser.parse(display)
    except:
        display = None
        parsed_dt = None

    if parsed_dt is None:
        # Parsing failed: fall back to the caller's original string.
        display = date_to_check
    else:
        # Normalize to a naive datetime so values compare/sort cleanly.
        parsed_dt = parsed_dt.replace(tzinfo=None)

    # Trim at most one leading space (matches historical behavior).
    display = display[1:] if display[0] == ' ' else display
    return display, parsed_dt
def define_date(date):
    """Convert a Google News date string into a datetime.

    Handles three families of input:
      * relative dates: "2 hours ago", "3 days ago", "1 month ago", ...
      * "Yesterday" (in any casing, with or without extra text)
      * absolute dates: "2 Feb 2021", "02/02/2021", "6 Aug" (the year
        defaults to the current year when omitted)

    Returns float('nan') when the string cannot be interpreted, so callers
    can detect failure without catching exceptions.
    """
    months = {'Jan':1,'Feb':2,'Mar':3,'Apr':4,'May':5,'Jun':6,'Jul':7,'Aug':8,'Sep':9,'Sept':9,'Oct':10,'Nov':11,'Dec':12,
              '01':1, '02':2, '03':3, '04':4, '05':5, '06':6, '07':7, '08':8, '09':9, '10':10, '11':11, '12':12}
    try:
        lowered = date.lower()
        # BUG FIX: "Yesterday" used to be tested inside the "... ago" branch,
        # so it could never match (it contains no " ago").  Check it first.
        if 'yesterday' in lowered:
            return datetime.datetime.now() - datetime.timedelta(days=1)
        if ' ago' in lowered:
            # e.g. "2 hours ago" -> ['2', 'hours', 'ago'] -> quantity '2'
            q = int(date.split()[-3])
            # Fixed-size units use stdlib timedelta — arithmetic identical to
            # the previous relativedelta calls for these units.
            # BUG FIX: 'min' also matches singular "1 minute ago", which the
            # old 'minutes'/'mins' test silently dropped.
            if 'min' in lowered:
                return datetime.datetime.now() - datetime.timedelta(minutes=q)
            elif 'hour' in lowered:
                return datetime.datetime.now() - datetime.timedelta(hours=q)
            elif 'day' in lowered:
                return datetime.datetime.now() - datetime.timedelta(days=q)
            elif 'week' in lowered:
                return datetime.datetime.now() - datetime.timedelta(days=7 * q)
            elif 'month' in lowered:
                # Months vary in length; keep relativedelta's calendar math.
                return datetime.datetime.now() + relativedelta(months=-q)
            # Unknown relative unit: fall through and return None, as before.
        else:
            # Absolute date: "d Mon yyyy" or "mm/dd/yyyy"-style after the
            # slash substitution.
            date_list = date.replace('/', ' ').split(' ')
            if len(date_list) == 2:
                # Year omitted — assume the current year.
                date_list.append(datetime.datetime.now().year)
            elif len(date_list) == 3:
                if date_list[0] == '':
                    date_list[0] = '1'
            return datetime.datetime(day=int(date_list[0]), month=months[date_list[1]], year=int(date_list[2]))
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit still
        # propagate; any parse failure is reported as NaN per the contract.
        return float('nan')
when upgrade""" 115 | self.set_time_range(start, end) 116 | 117 | def set_encode(self, encode): 118 | self.__encode = encode 119 | 120 | def set_topic(self, topic: str): 121 | self.__topic = topic 122 | 123 | def set_section(self, section: str): 124 | self.__section = section 125 | 126 | def setencode(self, encode): 127 | """Don't remove this, will affect old version user when upgrade""" 128 | self.set_encode(encode) 129 | 130 | def search(self, key): 131 | """ 132 | Searches for a term in google.com in the news section and retrieves the first page into __results. 133 | Parameters: 134 | key = the search term 135 | """ 136 | self.__key = key 137 | if self.__encode != "": 138 | self.__key = urllib.request.quote(self.__key.encode(self.__encode)) 139 | self.get_page() 140 | 141 | def build_response(self): 142 | self.req = urllib.request.Request(self.url.replace("search?","search?hl="+self.__lang+"&gl="+self.__lang+"&"), headers=self.headers) 143 | self.response = urllib.request.urlopen(self.req) 144 | self.page = self.response.read() 145 | self.content = Soup(self.page, "html.parser") 146 | stats = self.content.find_all("div", id="result-stats") 147 | if stats and isinstance(stats, ResultSet): 148 | stats = re.search(r'[\d,]+', stats[0].text) 149 | self.__totalcount = int(stats.group().replace(',', '')) 150 | else: 151 | #TODO might want to add output for user to know no data was found 152 | self.__totalcount = None 153 | logging.debug('Total count is not available when sort by date') 154 | result = self.content.find_all("a",attrs={'data-ved': True}) 155 | return result 156 | 157 | def remove_after_last_fullstop(self, s): 158 | # Find the last occurrence of the full stop 159 | last_period_index = s.rfind('.') 160 | # Slice the string up to the last full stop 161 | return s[:last_period_index+1] if last_period_index != -1 else s 162 | 163 | def page_at(self, page=1): 164 | """ 165 | Retrieves a specific page from google.com in the news sections into __results. 
    def page_at(self, page=1):
        """
        Retrieves a specific page of google.com news-tab results and RETURNS
        the parsed articles as a list of dicts (unlike get_page(), this does
        NOT store them into __results; it does still append to __texts and
        __links).
        Parameter:
        page = number of the page to be retrieved (1-based; mapped to the
               'start' query parameter as 10 * (page - 1))
        """
        results = []
        try:
            # Build the search URL.  Precedence: explicit date range, then
            # relative period (qdr:), then unrestricted.  sbd:1 sorts by date.
            if self.__start != "" and self.__end != "":
                self.url = "https://www.google.com/search?q={}&lr=lang_{}&biw=1920&bih=976&source=lnt&&tbs=lr:lang_1{},cdr:1,cd_min:{},cd_max:{},sbd:1&tbm=nws&start={}".format(self.__key,self.__lang,self.__lang,self.__start,self.__end,(10 * (page - 1)))
            elif self.__period != "":
                self.url = "https://www.google.com/search?q={}&lr=lang_{}&biw=1920&bih=976&source=lnt&&tbs=lr:lang_1{},qdr:{},,sbd:1&tbm=nws&start={}".format(self.__key,self.__lang,self.__lang,self.__period,(10 * (page - 1)))
            else:
                self.url = "https://www.google.com/search?q={}&lr=lang_{}&biw=1920&bih=976&source=lnt&&tbs=lr:lang_1{},sbd:1&tbm=nws&start={}".format(self.__key,self.__lang,self.__lang,(10 * (page - 1)))
        except AttributeError:
            # self.__key only exists after search() has run.
            raise AttributeError("You need to run a search() before using get_page().")
        try:
            result = self.build_response()
            for item in result:
                # Each field is scraped independently; any miss degrades to an
                # empty value instead of dropping the whole article.
                # NOTE(review): the chained find()/find_next_sibling() paths
                # below mirror Google's result markup at the time of writing
                # and will silently yield '' if the markup changes.
                try:
                    tmp_text = item.find("h3").text.replace("\n","")
                except Exception:
                    tmp_text = ''
                try:
                    # Strip Google's redirect prefix to get the target URL.
                    tmp_link = item.get("href").replace('/url?esrc=s&q=&rct=j&sa=U&url=','')
                except Exception:
                    tmp_link = ''
                try:
                    tmp_media = item.find('div').find('div').find('div').find_next_sibling('div').text
                except Exception:
                    tmp_media = ''
                try:
                    tmp_date = item.find('div').find_next_sibling('div').find('span').text
                    # tmp_datetime is computed but not used below; the result
                    # dict uses define_date(tmp_date) instead.
                    tmp_date,tmp_datetime=lexical_date_parser(tmp_date)
                except Exception:
                    tmp_date = ''
                    tmp_datetime=None
                try:
                    tmp_desc = self.remove_after_last_fullstop(item.find('div').find_next_sibling('div').find('div').find_next_sibling('div').find('div').find('div').find('div').text).replace('\n','')
                except Exception:
                    tmp_desc = ''
                try:
                    tmp_img = item.find("img").get("src")
                except Exception:
                    tmp_img = ''
                self.__texts.append(tmp_text)
                self.__links.append(tmp_link)
                results.append({'title': tmp_text, 'media': tmp_media,'date': tmp_date,'datetime':define_date(tmp_date),'desc': tmp_desc, 'link': tmp_link,'img': tmp_img})
            self.response.close()
        except Exception as e_parser:
            # Parsing/network failure: report it, and only propagate when the
            # user opted in via enableException().
            print(e_parser)
            if self.__exception:
                raise Exception(e_parser)
            else:
                pass
        return results
self.__texts.append(tmp_text) 209 | self.__links.append(tmp_link) 210 | results.append({'title': tmp_text, 'media': tmp_media,'date': tmp_date,'datetime':define_date(tmp_date),'desc': tmp_desc, 'link': tmp_link,'img': tmp_img}) 211 | self.response.close() 212 | except Exception as e_parser: 213 | print(e_parser) 214 | if self.__exception: 215 | raise Exception(e_parser) 216 | else: 217 | pass 218 | return results 219 | 220 | def get_page(self, page=1): 221 | """ 222 | Retrieves a specific page from google.com in the news sections into __results. 223 | Parameter: 224 | page = number of the page to be retrieved 225 | """ 226 | try: 227 | if self.__start != "" and self.__end != "": 228 | self.url = "https://www.google.com/search?q={}&lr=lang_{}&biw=1920&bih=976&source=lnt&&tbs=lr:lang_1{},cdr:1,cd_min:{},cd_max:{},sbd:1&tbm=nws&start={}".format(self.__key,self.__lang,self.__lang,self.__start,self.__end,(10 * (page - 1))) 229 | elif self.__period != "": 230 | self.url = "https://www.google.com/search?q={}&lr=lang_{}&biw=1920&bih=976&source=lnt&&tbs=lr:lang_1{},qdr:{},,sbd:1&tbm=nws&start={}".format(self.__key,self.__lang,self.__lang,self.__period,(10 * (page - 1))) 231 | else: 232 | self.url = "https://www.google.com/search?q={}&lr=lang_{}&biw=1920&bih=976&source=lnt&&tbs=lr:lang_1{},sbd:1&tbm=nws&start={}".format(self.__key,self.__lang,self.__lang,(10 * (page - 1))) 233 | except AttributeError: 234 | raise AttributeError("You need to run a search() before using get_page().") 235 | try: 236 | result = self.build_response() 237 | for item in result: 238 | try: 239 | tmp_text = item.find("h3").text.replace("\n","") 240 | except Exception: 241 | tmp_text = '' 242 | try: 243 | tmp_link = item.get("href").replace('/url?esrc=s&q=&rct=j&sa=U&url=','') 244 | except Exception: 245 | tmp_link = '' 246 | try: 247 | tmp_media = item.find('div').find('div').find('div').find_next_sibling('div').text 248 | except Exception: 249 | tmp_media = '' 250 | try: 251 | tmp_date = 
item.find('div').find_next_sibling('div').find('span').text 252 | tmp_date,tmp_datetime=lexical_date_parser(tmp_date) 253 | except Exception: 254 | tmp_date = '' 255 | tmp_datetime=None 256 | try: 257 | tmp_desc = self.remove_after_last_fullstop(item.find('div').find_next_sibling('div').find('div').find_next_sibling('div').find('div').find('div').find('div').text).replace('\n','') 258 | except Exception: 259 | tmp_desc = '' 260 | try: 261 | tmp_img = item.find("img").get("src") 262 | except Exception: 263 | tmp_img = '' 264 | self.__texts.append(tmp_text) 265 | self.__links.append(tmp_link) 266 | self.__results.append({'title': tmp_text, 'media': tmp_media,'date': tmp_date,'datetime':define_date(tmp_date),'desc': tmp_desc, 'link': tmp_link,'img': tmp_img}) 267 | self.response.close() 268 | except Exception as e_parser: 269 | print(e_parser) 270 | if self.__exception: 271 | raise Exception(e_parser) 272 | else: 273 | pass 274 | 275 | def getpage(self, page=1): 276 | """Don't remove this, will affect old version user when upgrade""" 277 | self.get_page(page) 278 | 279 | def get_news(self, key="",deamplify=False): 280 | if key != '': 281 | if self.__period != "": 282 | key += f" when:{self.__period}" 283 | else: 284 | if self.__period != "": 285 | key += f"when:{self.__period}" 286 | key = urllib.request.quote(key.encode(self.__encode)) 287 | start = f'{self.__start[-4:]}-{self.__start[:2]}-{self.__start[3:5]}' 288 | end = f'{self.__end[-4:]}-{self.__end[:2]}-{self.__end[3:5]}' 289 | 290 | if self.__start == '' or self.__end == '': 291 | self.url = 'https://news.google.com/search?q={}&hl={}'.format( 292 | key, self.__lang.lower()) 293 | else: 294 | self.url = 'https://news.google.com/search?q={}+before:{}+after:{}&hl={}'.format( 295 | key, end, start, self.__lang.lower()) 296 | 297 | if self.__topic: 298 | self.url = 'https://news.google.com/topics/{}'.format( 299 | self.__topic) 300 | 301 | if self.__section: 302 | self.url = 
'https://news.google.com/topics/{}/sections/{}'.format( 303 | self.__topic, self.__section) 304 | 305 | 306 | try: 307 | self.req = urllib.request.Request(self.url, headers=self.headers) 308 | self.response = urllib.request.urlopen(self.req) 309 | self.page = self.response.read() 310 | self.content = Soup(self.page, "html.parser") 311 | articles = self.content.select('article') 312 | for article in articles: 313 | try: 314 | # title 315 | try: 316 | title=article.findAll('div')[2].findAll('a')[0].text 317 | except: 318 | try: 319 | title=article.findAll('a')[1].text 320 | except: 321 | title=None 322 | # description 323 | try: 324 | desc=None 325 | except: 326 | desc=None 327 | # date 328 | try: 329 | date = article.find("time").text 330 | # date,datetime_tmp = lexial_date_parser(date) 331 | except: 332 | date = None 333 | # datetime 334 | try: 335 | datetime_chars=article.find('time').get('datetime') 336 | datetime_obj = parse(datetime_chars).replace(tzinfo=None) 337 | except: 338 | datetime_obj=None 339 | # link 340 | if deamplify: 341 | try: 342 | link = 'https://news.google.com/' + article.find('div').find("a").get("href")[2:] 343 | except Exception as deamp_e: 344 | print(deamp_e) 345 | link = article.find("article").get("jslog").split('2:')[1].split(';')[0] 346 | else: 347 | try: 348 | link = 'https://news.google.com/' + article.find('div').find("a").get("href")[2:] 349 | except Exception as deamp_e: 350 | print(deamp_e) 351 | link = None 352 | self.__texts.append(title) 353 | self.__links.append(link) 354 | if link.startswith('https://www.youtube.com/watch?v='): 355 | desc = 'video' 356 | # image 357 | try: 358 | img = 'https://news.google.com'+article.find("figure").find("img").get("src") 359 | except: 360 | img = None 361 | # site 362 | try: 363 | site=article.find("time").parent.find("a").text 364 | except: 365 | site=None 366 | try: 367 | media=article.find("div").findAll("div")[1].find("div").find("div").find("div").text 368 | except: 369 | try: 370 | 
media=article.findAll("div")[1].find("div").find("div").find("div").text 371 | except: 372 | media=None 373 | # reporter 374 | try: 375 | reporter = article.findAll('span')[2].text 376 | except: 377 | reporter = None 378 | # collection 379 | self.__results.append({'title':title, 380 | 'desc':desc, 381 | 'date':date, 382 | 'datetime':define_date(date), 383 | 'link':link, 384 | 'img':img, 385 | 'media':media, 386 | 'site':site, 387 | 'reporter':reporter}) 388 | except Exception as e_article: 389 | print(e_article) 390 | self.response.close() 391 | except Exception as e_parser: 392 | print(e_parser) 393 | if self.__exception: 394 | raise Exception(e_parser) 395 | else: 396 | pass 397 | 398 | def total_count(self): 399 | return self.__totalcount 400 | 401 | def result(self,sort=False): 402 | """Don't remove this, will affect old version user when upgrade""" 403 | return self.results(sort) 404 | 405 | def results(self,sort=False): 406 | """Returns the __results. 407 | New feature: include datatime and sort the articles in decreasing order""" 408 | results=self.__results 409 | if sort: 410 | try: 411 | results.sort(key = lambda x:x['datetime'],reverse=True) 412 | except Exception as e_sort: 413 | print(e_sort) 414 | if self.__exception: 415 | raise Exception(e_sort) 416 | else: 417 | pass 418 | results=self.__results 419 | return results 420 | 421 | def get_texts(self): 422 | """Returns only the __texts of the __results.""" 423 | return self.__texts 424 | 425 | def gettext(self): 426 | """Don't remove this, will affect old version user when upgrade""" 427 | return self.get_texts() 428 | 429 | def get_links(self): 430 | """Returns only the __links of the __results.""" 431 | return self.__links 432 | 433 | def clear(self): 434 | self.__texts = [] 435 | self.__links = [] 436 | self.__results = [] 437 | self.__totalcount = 0 438 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | Copyright (c) 2018 The Python Packaging Authority 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 
20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GoogleNews 2 | 3 | [![Build Status](https://app.travis-ci.com/Iceloof/GoogleNews.svg)](https://app.travis-ci.com/github/Iceloof/GoogleNews) 4 | [![Coverage Status](https://coveralls.io/repos/github/Iceloof/GoogleNews/badge.svg)](https://coveralls.io/github/Iceloof/GoogleNews) 5 | [![PyPI](https://img.shields.io/pypi/v/GoogleNews)](https://pypi.org/project/GoogleNews/) 6 | [![PyPI - Downloads](https://img.shields.io/pypi/dm/GoogleNews)](https://pypistats.org/packages/googlenews) 7 | ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/GoogleNews) 8 | ![PyPI - Wheel](https://img.shields.io/pypi/wheel/GoogleNews) 9 | ![GitHub contributors](https://img.shields.io/github/contributors/Iceloof/GoogleNews) 10 | ![GitHub issues](https://img.shields.io/github/issues-raw/Iceloof/GoogleNews) 11 | ![GitHub Action](https://github.com/Iceloof/GoogleNews/workflows/GitHub%20Action/badge.svg) 12 | ![GitHub](https://img.shields.io/github/license/Iceloof/GoogleNews) 13 | 14 | ## Install 15 | ``` 16 | pip install GoogleNews 17 | ``` 18 | or 19 | ``` 20 | pip install --upgrade GoogleNews 21 | ``` 22 | ## Usage 23 | - Initializing 24 | ``` 25 | from GoogleNews import GoogleNews 26 | googlenews = GoogleNews() 27 | ``` 28 | - Check version 29 | ``` 30 | print(googlenews.getVersion()) 31 | ``` 32 | - Enable to throw exception 33 | ``` 34 | googlenews.enableException(True) 35 | ``` 36 | - Optional choose language 37 | ``` 38 | googlenews = GoogleNews(lang='en') 39 | ``` 40 | or 41 | ``` 42 | googlenews = GoogleNews(lang='en', region='US') 43 | ``` 44 | - Optional choose period (period and custom day range should not set together) 45 | ``` 46 | googlenews = GoogleNews(period='7d') 47 | ``` 48 | - Optional choose custom day range (mm/dd/yyyy) 49 | ``` 50 | googlenews = 
GoogleNews(start='02/01/2020',end='02/28/2020') 51 | ``` 52 | - Optional set encode 53 | ``` 54 | googlenews = GoogleNews(encode='utf-8') 55 | ``` 56 | or 57 | ``` 58 | googlenews.set_lang('en') 59 | googlenews.set_period('7d') 60 | googlenews.set_time_range('02/01/2020','02/28/2020') 61 | googlenews.set_encode('utf-8') 62 | ``` 63 | - **news.google.com** search sample 64 | ``` 65 | googlenews.get_news('APPLE') 66 | ``` 67 | - **news.google.com get news by topics 68 | ``` 69 | # Sports 70 | googlenews.set_topic('CAAqKggKIiRDQkFTRlFvSUwyMHZNRFp1ZEdvU0JYQjBMVUpTR2dKQ1VpZ0FQAQ') 71 | googlenews.get_news() 72 | ``` 73 | - **news.google.com get news by topic and sections 74 | ``` 75 | # Sports 76 | googlenews.set_topic('CAAqKggKIiRDQkFTRlFvSUwyMHZNRFp1ZEdvU0JYQjBMVUpTR2dKQ1VpZ0FQAQ') 77 | # Football Soccer 78 | googlenews.set_section('CAQiS0NCQVNNZ29JTDIwdk1EWnVkR29TQlhCMExVSlNHZ0pDVWlJT0NBUWFDZ29JTDIwdk1ESjJlRFFxQ3dvSkVnZEdkWFJsWW05c0tBQSouCAAqKggKIiRDQkFTRlFvSUwyMHZNRFp1ZEdvU0JYQjBMVUpTR2dKQ1VpZ0FQAVAB') 79 | 80 | googlenews.get_news() 81 | ``` 82 | - **google.com** section news search sample 83 | ``` 84 | googlenews.search('APPLE') 85 | ``` 86 | 87 | Default return first page result, you don't need to get first page again, otherwise you might get duplicate result. 
To get other page of search results: 88 | 89 | ``` 90 | googlenews.get_page(2) 91 | ``` 92 | - If you only want to get specific page 93 | ``` 94 | result = googlenews.page_at(2) 95 | ``` 96 | - If you want to get the total result number of the search(this is approximate number, not exact number, it is the number showing on the google search page) (Note: this function is not available for `googlenews.search()`) 97 | ``` 98 | googlenews.total_count() 99 | ``` 100 | - Get results will return the list, `[{'title': '...', 'media': '...', 'date': '...', 'datetime': '...', 'desc': '...', 'link': '...', 'img': '...'}]` 101 | ``` 102 | googlenews.results() 103 | ``` 104 | if `googlenews.results(sort=True)` the tool will try to order the results in cronologically reversed order 105 | 106 | - Get texts will return the list of news titles 107 | ``` 108 | googlenews.get_texts() 109 | ``` 110 | - Get links returns the list of news links 111 | ``` 112 | googlenews.get_links() 113 | ``` 114 | - Clear result list before doing another search with the same object 115 | ``` 116 | googlenews.clear() 117 | ``` 118 | ## Issue 119 | Image is not working in the latest version, it can only return default google loading gif 120 | 121 | The date range is not always working as Google may return the result with random order or out of date range. 122 | 123 | Google may recognize the program as automated robots and block the IP, using cloud server and fetching data with high frequency will get higher chance to be blocked. 
import setuptools

# Read the long description from the README so PyPI renders the project docs.
# BUG FIX: encoding="utf-8" prevents a UnicodeDecodeError on platforms whose
# default locale encoding is not UTF-8 (e.g. Windows cp1252), since README.md
# contains non-ASCII characters.
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

setuptools.setup(
    name="GoogleNews",
    version="1.6.15",
    author="Hurin Hu",
    author_email="hurin@live.ca",
    description="Google News search for Python",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/Iceloof/GoogleNews",
    packages=setuptools.find_packages(),
    install_requires=['beautifulsoup4', 'dateparser', 'python-dateutil'],
    classifiers=[
        "Programming Language :: Python :: 3.6",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
)
0) 26 | print('Result length with two pages is correct') 27 | 28 | def testEncode(self): 29 | googlenews = GoogleNews(lang='ru',encode='utf-8') 30 | googlenews.search("Моцарт") 31 | length = len(googlenews.result()) 32 | self.assertNotEqual(length, 0) 33 | print('Encoding result is not empty') 34 | 35 | # def testTotalCountGreaterThanZero(self): 36 | # googlenews = GoogleNews() 37 | # googlenews.search(keyword) 38 | # count = googlenews.total_count() 39 | # self.assertGreater(count, 0) 40 | # print('Total count is greater than zero') 41 | 42 | def testResultNumberAtTwoPages(self): 43 | googlenews = GoogleNews() 44 | googlenews.search(keyword) 45 | result = googlenews.page_at(2) 46 | length = len(result) 47 | self.assertNotEqual(length, 0) 48 | print('Result length at two pages is correct') 49 | 50 | class TestStringMethods(unittest.TestCase): 51 | 52 | def testVersion(self): 53 | googlenews = GoogleNews() 54 | version = '1.6.15' 55 | self.assertIn(version, googlenews.getVersion()) 56 | print('Latest version matched') 57 | 58 | def testResultContainsKeyword(self): 59 | googlenews = GoogleNews() 60 | googlenews.search(keyword) 61 | result = googlenews.result()[0] 62 | print(result.get('title').lower()+result.get('desc').lower()) 63 | self.assertIn(keyword.lower(), result.get('title').lower()+result.get('desc').lower()) 64 | print('Result contains keyword') 65 | 66 | def testResultHasLink(self): 67 | googlenews = GoogleNews() 68 | googlenews.search(keyword) 69 | result = googlenews.result()[0] 70 | print(result.get('link').lower()) 71 | self.assertIn('http', result.get('link').lower()) 72 | print('Result contains http link') 73 | 74 | def testResultHasImage(self): 75 | googlenews = GoogleNews() 76 | googlenews.search(keyword) 77 | result = googlenews.result()[0] 78 | print(result.get('img').lower()) 79 | self.assertIn('base64', result.get('img').lower()) 80 | print('Result contains image') 81 | 82 | def testResultHasTitle(self): 83 | googlenews = GoogleNews() 84 | 
### MODULES

from GoogleNews import GoogleNews

### METHODS

def show_routine(results):
    # Print "<index>. <date> - <title>" for every article dict returned by
    # GoogleNews.results(), so the ordering of the two feeds can be compared
    # by eye.
    for num,page in enumerate(results):
        print(f"{num}. {page['date']} - {page['title']}")

### MAIN

# Set up the search: same keyword and period, but two independent client
# objects so the news.google.com and google.com result lists don't share
# internal state.
keywords="covid cava de' tirreni"
period='10d'
google_news = GoogleNews(lang='it',period=period)
google=GoogleNews(lang='it',period=period)

# Results from news.google.com, sorted newest-first via results(sort=True)
google_news.get_news(keywords)
results_gnews=google_news.results(sort=True)
show_routine(results_gnews)

# Results from google.com (news tab), sorted the same way
google.search(keywords)
results_google=google.results(sort=True)
show_routine(results_google)